From e40fbbf0572c5e41dc87ad79001748ed399ce32d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 19 Jan 2022 11:44:40 +0000 Subject: uapi/bpf: Add missing description and returns for helper documentation Both description and returns section will become mandatory for helpers and syscalls in a later commit to generate man pages. This commit also adds in the documentation that BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN for anyone searching for the syscall in the generated man pages. Signed-off-by: Usama Arif Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220119114442.1452088-1-usama.arif@bytedance.com --- include/uapi/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b0383d371b9a..a9c96c21330a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -330,6 +330,8 @@ union bpf_iter_link_info { * *ctx_out*, *data_in* and *data_out* must be NULL. * *repeat* must be zero. * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -1775,6 +1777,8 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. * * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. * Return * A 64-bit integer containing the current tgid and pid, and * created as such: @@ -1782,6 +1786,8 @@ union bpf_attr { * *current_task*\ **->pid**. * * u64 bpf_get_current_uid_gid(void) + * Description + * Get the current uid and gid. * Return * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. @@ -2256,6 +2262,8 @@ union bpf_attr { * The 32-bit hash. * * u64 bpf_get_current_task(void) + * Description + * Get the current task. * Return * A pointer to the current task struct. * @@ -2369,6 +2377,8 @@ union bpf_attr { * indicate that the hash is outdated and to trigger a * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. * * long bpf_get_numa_node_id(void) * Description @@ -2466,6 +2476,8 @@ union bpf_attr { * A 8-byte long unique number or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Description + * Get the owner UID of the socked associated to *skb*. * Return * The owner UID of the socket associated to *skb*. If the socket * is **NULL**, or if it is not a full socket (i.e. if it is a @@ -3240,6 +3252,9 @@ union bpf_attr { * The id is returned or 0 in case the id could not be retrieved. * * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running. * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. -- cgit v1.2.3 From b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:27 +0000 Subject: bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall return value The helpers continue to use int for retval because all the hooks are int-returning rather than long-returning. The return value of bpf_set_retval is int for future-proofing, in case in the future there may be errors trying to set the retval. 
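As a rough sketch of the intended usage (not part of this patch; it assumes a
libbpf toolchain whose helper definitions are regenerated from this UAPI so
that bpf_set_retval() is declared), a cgroup program can now pick the errno
seen by userspace instead of the forced -EPERM. The program name and chosen
errno below are illustrative only:

    #include <linux/bpf.h>
    #include <errno.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("cgroup/bind4")
    int reject_port_80(struct bpf_sock_addr *ctx)
    {
            if (ctx->user_port == bpf_htons(80)) {
                    bpf_set_retval(-EACCES); /* overrides the default -EPERM */
                    return 0;                /* reject the syscall */
            }
            return 1;                        /* allow */
    }

    char _license[] SEC("license") = "GPL";
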
After the previous patch, if a program rejects a syscall by returning 0, an -EPERM will be generated no matter if the retval is already set to -err. This patch change it being forced only if retval is not -err. This is because we want to support, for example, invoking bpf_set_retval(-EINVAL) and return 0, and have the syscall return value be -EINVAL not -EPERM. For BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY, the prior behavior is that, if the return value is NET_XMIT_DROP, the packet is silently dropped. We preserve this behavior for backward compatibility reasons, so even if an errno is set, the errno does not return to caller. However, setting a non-err to retval cannot propagate so this is not allowed and we return a -EFAULT in that case. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/b4013fd5d16bed0b01977c1fafdeae12e1de61fb.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++---- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/cgroup.c | 38 +++++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 18 ++++++++++++++++++ 4 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7b0c11f414d0..dce54eb0aae8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1299,7 +1299,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); - if (!(func_ret & 1)) + if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; @@ -1329,7 +1329,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - if (!run_prog(prog, ctx)) + if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -1389,7 +1389,7 @@ out: * 0: NET_XMIT_SUCCESS skb should be transmitted * 1: NET_XMIT_DROP skb should be dropped and cn * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -EPERM skb should be dropped + * 3: -err skb should be dropped */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ @@ -1398,10 +1398,12 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ + if (_ret && !IS_ERR_VALUE((long)_ret)) \ + _ret = -EFAULT; \ if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ - _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret = (_cn ? NET_XMIT_DROP : _ret); \ _ret; \ }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b6fad0bbf5a7..279ebbed75a5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr - * -EPERM - drop packet + * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. @@ -1080,6 +1080,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb, 0); + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1205,6 +1207,36 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_0(bpf_get_retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + return ctx->retval; +} + +static const struct bpf_func_proto bpf_get_retval_proto = { + .func = bpf_get_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_set_retval, int, retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + ctx->retval = retval; + return 0; +} + +static const struct bpf_func_proto bpf_set_retval_proto = { + .func = bpf_set_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1217,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From c2f2cdbeffda7b153c19e0f3d73149c41026c0db Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:52 +0100 Subject: bpf: introduce BPF_F_XDP_HAS_FRAGS flag in prog_flags loading the ebpf program Introduce BPF_F_XDP_HAS_FRAGS and the related field in bpf_prog_aux in order to notify the driver the loaded program support xdp frags. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/db2e8075b7032a356003f407d1b0deb99adaa0ed.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/syscall.c | 4 +++- tools/include/uapi/linux/bpf.h | 5 +++++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80e3387ea3af..e93ed028a030 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -933,6 +933,7 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; + bool xdp_has_frags; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9e0631f091a6..f29090643c6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2217,7 +2217,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | - BPF_F_TEST_RND_HI32)) + BPF_F_TEST_RND_HI32 | + BPF_F_XDP_HAS_FRAGS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2303,6 +2304,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; + prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. 
+ */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * -- cgit v1.2.3 From 0165cc817075cf701e4289838f1d925ff1911b3e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:54 +0100 Subject: bpf: introduce bpf_xdp_get_buff_len helper Introduce bpf_xdp_get_buff_len helper in order to return the xdp buffer total size (linear and paged area) Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/aac9ac3504c84026cf66a3c71b7c5ae89bc991be.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 14 ++++++++++++++ include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8463dea8b4db..52b593321956 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -145,6 +145,20 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp) return (struct skb_shared_info *)xdp_data_hard_end(xdp); } +static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +{ + unsigned int len = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + len += sinfo->xdp_frags_size; +out: + return len; +} + struct xdp_frame { void *data; u16 len; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 945649c67e03..5a28772063f6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5054,6 +5054,12 @@ union bpf_attr { * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5244,6 +5250,7 @@ union bpf_attr { FN(get_func_arg_cnt), \ FN(get_retval), \ FN(set_retval), \ + FN(xdp_get_buff_len), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index f73a84c75970..a7f03bbca465 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3783,6 +3783,19 @@ static const struct bpf_func_proto sk_skb_change_head_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +{ + return xdp_get_buff_len(xdp); +} + +static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : @@ -7533,6 +7546,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 945649c67e03..5a28772063f6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5054,6 +5054,12 @@ union bpf_attr { * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5244,6 +5250,7 @@ union bpf_attr { FN(get_func_arg_cnt), \ FN(get_retval), \ FN(set_retval), \ + FN(xdp_get_buff_len), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3f364222d032eea6b245780e845ad213dab28cdd Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:10:03 +0100 Subject: net: xdp: introduce bpf_xdp_pointer utility routine Similar to skb_header_pointer, introduce bpf_xdp_pointer utility routine to return a pointer to a given position in the xdp_buff if the requested area (offset + len) is contained in a contiguous memory area otherwise it will be copied in a bounce buffer provided by the caller. Similar to the tc counterpart, introduce the two following xdp helpers: - bpf_xdp_load_bytes - bpf_xdp_store_bytes Reviewed-by: Eelco Chaudron Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jakub Kicinski Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/ab285c1efdd5b7a9d361348b1e7d3ef49f6382b3.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 +++++ net/core/filter.c | 176 ++++++++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 18 +++++ 3 files changed, 174 insertions(+), 38 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5a28772063f6..16a7574292a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5060,6 +5060,22 @@ union bpf_attr { * Get the total size of a given xdp buff (linear and paged area) * Return * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5251,6 +5267,8 @@ union bpf_attr { FN(get_retval), \ FN(set_retval), \ FN(xdp_get_buff_len), \ + FN(xdp_load_bytes), \ + FN(xdp_store_bytes), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index e4ce138bf925..945ccaaab3cb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3839,6 +3839,138 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) +{ + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + void *src, *dst; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + src = flush ? buf : xdp->data + off; + dst = flush ? xdp->data + off : buf; + memcpy(dst, src, len); + return; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + src = flush ? buf : ptr_buf + copy_off; + dst = flush ? ptr_buf + copy_off : buf; + memcpy(dst, src, copy_len); + + off += copy_len; + len -= copy_len; + buf += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } +} + +static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len < size ? 
addr + offset : NULL; +} + +BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, false); + else + memcpy(buf, ptr, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { + .func = bpf_xdp_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, true); + else + memcpy(ptr, buf, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { + .func = bpf_xdp_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -4677,48 +4809,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { }; #endif -static unsigned long bpf_xdp_copy(void *dst_buff, const void *ctx, +static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { struct xdp_buff *xdp = (struct xdp_buff *)ctx; - unsigned long ptr_len, ptr_off = 0; - skb_frag_t *next_frag, *end_frag; - struct skb_shared_info *sinfo; - u8 *ptr_buf; - - if (likely(xdp->data_end - xdp->data >= off + len)) { - memcpy(dst_buff, xdp->data + off, len); - return 0; - } - - sinfo = xdp_get_shared_info_from_buff(xdp); - end_frag = &sinfo->frags[sinfo->nr_frags]; - next_frag = &sinfo->frags[0]; - - ptr_len = xdp->data_end - xdp->data; - ptr_buf = xdp->data; - - while (true) { - if (off < ptr_off + ptr_len) { - unsigned long copy_off = off - ptr_off; - unsigned long copy_len = min(len, ptr_len - copy_off); - - memcpy(dst_buff, ptr_buf + copy_off, copy_len); - - off += copy_len; - len -= copy_len; - dst_buff += copy_len; - } - - if (!len || next_frag == end_frag) - break; - - ptr_off += ptr_len; - ptr_buf = skb_frag_address(next_frag); - ptr_len = skb_frag_size(next_frag); - next_frag++; - } + bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } @@ -7660,6 +7756,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; + case BPF_FUNC_xdp_load_bytes: + return &bpf_xdp_load_bytes_proto; + case BPF_FUNC_xdp_store_bytes: + return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5a28772063f6..16a7574292a5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5060,6 +5060,22 @@ union bpf_attr { * Get the total size of a given xdp buff (linear and paged area) * Return * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. 
It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5251,6 +5267,8 @@ union bpf_attr { FN(get_retval), \ FN(set_retval), \ FN(xdp_get_buff_len), \ + FN(xdp_load_bytes), \ + FN(xdp_store_bytes), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 376040e47334c6dc6a939a32197acceb00fe4acf Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Mon, 24 Jan 2022 10:54:01 -0800 Subject: bpf: Add bpf_copy_from_user_task() helper This adds a helper for bpf programs to read the memory of other tasks. As an example use case at Meta, we are using a bpf task iterator program and this new helper to print C++ async stack traces for all threads of a given process. Signed-off-by: Kenny Yu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220124185403.468466-3-kennyyu@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/helpers.c | 34 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 5 files changed, 59 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8c92c974bd12..394305a5e02f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2243,6 +2243,7 @@ extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_strncmp_proto; +extern const struct bpf_func_proto bpf_copy_from_user_task_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..ed2780b76cc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -671,6 +672,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, + const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) +{ + int ret; + + /* flags is not used yet */ + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(!size)) + return 0; + + ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); + if (ret == size) + return 0; + + memset(dst, 0, size); + /* Return -EFAULT for partial read */ + return ret < 0 ? ret : -EFAULT; +} + +const struct bpf_func_proto bpf_copy_from_user_task_proto = { + .func = bpf_copy_from_user_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_BTF_ID, + .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg5_type = ARG_ANYTHING +}; + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 06a9e220069e..a2024ba32a20 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1235,6 +1235,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_copy_from_user_task: + return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 9690ae60429020f38e4aa2540c306f27eb021bc0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Jan 2022 10:42:59 -0800 Subject: ethtool: add header/data split indication For applications running on a mix of platforms it's useful to have a clear indication whether host's NIC supports the geometry requirements of TCP zero-copy. TCP zero-copy Rx requires data to be neatly placed into memory pages. Most NICs can't do that. This patch is adding GET support only, since the NICs I work with either always have the feature enabled or enable it whenever MTU is set to jumbo. In other words I don't need SET. But adding set should be trivial. (The only note on SET is that we will likely want the setting to be "sticky" and use 0 / `unknown` to reset it back to driver default.) Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 8 ++++++++ include/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink.h | 7 +++++++ net/ethtool/rings.c | 15 ++++++++++----- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 9d98e0511249..cae28af7a476 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -860,8 +860,16 @@ Kernel response contents: ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` u8 TCP header / data split ==================================== ====== =========================== +``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with +page-flipping TCP zero-copy receive (``getsockopt(TCP_ZEROCOPY_RECEIVE)``). +If enabled the device is configured to place frame headers and data into +separate buffers. The device configuration must make it possible to receive +full memory pages of data, for example because MTU is high enough or through +HW-GRO. + RINGS_SET ========= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 11efc45de66a..e0853f48b75e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -70,9 +70,11 @@ enum { /** * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. 
+ * @tcp_data_split: Scatter packet headers and data to separate buffers */ struct kernel_ethtool_ringparam { u32 rx_buf_len; + u8 tcp_data_split; }; /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cca6e474a085..417d4280d7b5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -318,6 +318,12 @@ enum { /* RINGS */ +enum { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_TCP_DATA_SPLIT_DISABLED, + ETHTOOL_TCP_DATA_SPLIT_ENABLED, +}; + enum { ETHTOOL_A_RINGS_UNSPEC, ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ @@ -330,6 +336,7 @@ enum { ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ ETHTOOL_A_RINGS_TX, /* u32 */ ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index c1d5f5e0fdc9..18a5035d3bee 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -53,7 +53,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ - nla_total_size(sizeof(u32)); /* _RINGS_RX_BUF_LEN */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ + nla_total_size(sizeof(u8)); /* _RINGS_TCP_DATA_SPLIT */ } static int rings_fill_reply(struct sk_buff *skb, @@ -61,9 +62,11 @@ static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); - const struct kernel_ethtool_ringparam *kernel_ringparam = &data->kernel_ringparam; + const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; + WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); + if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || @@ -84,9 +87,11 @@ static int rings_fill_reply(struct sk_buff *skb, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || - (kernel_ringparam->rx_buf_len && - (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, - kernel_ringparam->rx_buf_len)))) + (kr->rx_buf_len && + (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || + (kr->tcp_data_split && + (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, + kr->tcp_data_split)))) return -EMSGSIZE; return 0; -- cgit v1.2.3 From b2a90f4fcb146d0e033203ab646f0fd22cfa947f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 13 Jan 2022 11:20:22 +0100 Subject: media: lirc: remove unused lirc features These features have never been implemented by any lirc driver, including staging or out of tree drivers. The ioctls for these feaures were removed in commit d55f09abe24b ("[media] lirc.h: remove several unused ioctls"). So, we can safely remove them. 
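For context on how these bits surface (or, in this case, never surfaced) to
applications, feature discovery goes through the LIRC_GET_FEATURES ioctl; a
minimal sketch using only pre-existing uAPI:

    #include <linux/lirc.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    /* Print a couple of capabilities of an opened /dev/lircN fd. */
    static int print_features(int fd)
    {
            unsigned int features;

            if (ioctl(fd, LIRC_GET_FEATURES, &features))
                    return -1;

            if (features & LIRC_CAN_REC_MODE2)
                    printf("raw IR receiver\n");
            if (features & LIRC_CAN_SET_REC_TIMEOUT)
                    printf("receive timeout configurable\n");
            return 0;
    }

Old binaries that tested the removed bits keep working: no driver ever set
them, so those tests were always false.
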
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/rc/lirc-get-features.rst | 18 ------------------ include/uapi/linux/lirc.h | 4 ---- 2 files changed, 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/rc/lirc-get-features.rst b/Documentation/userspace-api/media/rc/lirc-get-features.rst index 4bf25860f932..545137620ead 100644 --- a/Documentation/userspace-api/media/rc/lirc-get-features.rst +++ b/Documentation/userspace-api/media/rc/lirc-get-features.rst @@ -102,12 +102,6 @@ LIRC features The driver supports setting the receive carrier frequency using :ref:`ioctl LIRC_SET_REC_CARRIER `. -.. _LIRC-CAN-SET-REC-DUTY-CYCLE-RANGE: - -``LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE`` - - Unused. Kept just to avoid breaking uAPI. - .. _LIRC-CAN-SET-REC-CARRIER-RANGE: ``LIRC_CAN_SET_REC_CARRIER_RANGE`` @@ -129,12 +123,6 @@ LIRC features The driver supports :ref:`ioctl LIRC_SET_REC_TIMEOUT `. -.. _LIRC-CAN-SET-REC-FILTER: - -``LIRC_CAN_SET_REC_FILTER`` - - Unused. Kept just to avoid breaking uAPI. - .. _LIRC-CAN-MEASURE-CARRIER: ``LIRC_CAN_MEASURE_CARRIER`` @@ -149,12 +137,6 @@ LIRC features The driver supports learning mode using :ref:`ioctl LIRC_SET_WIDEBAND_RECEIVER `. -.. _LIRC-CAN-NOTIFY-DECODE: - -``LIRC_CAN_NOTIFY_DECODE`` - - Unused. Kept just to avoid breaking uAPI. - .. _LIRC-CAN-SEND-RAW: ``LIRC_CAN_SEND_RAW`` diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 9919f2062b14..a1f9c26ea537 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -72,11 +72,9 @@ #define LIRC_CAN_SET_REC_CARRIER (LIRC_CAN_SET_SEND_CARRIER << 16) #define LIRC_CAN_SET_REC_DUTY_CYCLE (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16) -#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000 #define LIRC_CAN_SET_REC_CARRIER_RANGE 0x80000000 #define LIRC_CAN_GET_REC_RESOLUTION 0x20000000 #define LIRC_CAN_SET_REC_TIMEOUT 0x10000000 -#define LIRC_CAN_SET_REC_FILTER 0x08000000 #define LIRC_CAN_MEASURE_CARRIER 0x02000000 #define LIRC_CAN_USE_WIDEBAND_RECEIVER 0x04000000 @@ -84,8 +82,6 @@ #define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK) #define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK) -#define LIRC_CAN_NOTIFY_DECODE 0x01000000 - /*** IOCTL commands for lirc driver ***/ #define LIRC_GET_FEATURES _IOR('i', 0x00000000, __u32) -- cgit v1.2.3 From 68a99f6a0ebfe9101ea79ba5af1c407a5ad4f629 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 15 Jan 2022 11:19:11 +0100 Subject: media: lirc: report ir receiver overflow If the driver reports that the hardware had an overflow, report this to userspace. It would be nice to know when this happens, and not just get a long space. This change has been tested with lircd, ir-ctl, and ir-keytable. 
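A sketch of how a chardev reader in LIRC_MODE_MODE2 might consume the new
sample type (macros are those added or kept by this patch; error handling
trimmed):

    #include <linux/lirc.h>
    #include <unistd.h>
    #include <stdio.h>

    static void read_one_sample(int fd)
    {
            unsigned int sample;

            if (read(fd, &sample, sizeof(sample)) != sizeof(sample))
                    return;

            if (LIRC_IS_OVERFLOW(sample))       /* new in this patch */
                    printf("overflow: IR data was lost\n");
            else if (LIRC_IS_PULSE(sample))
                    printf("pulse %u us\n", LIRC_VALUE(sample));
            else if (LIRC_IS_SPACE(sample))
                    printf("space %u us\n", LIRC_VALUE(sample));
            else if (LIRC_IS_TIMEOUT(sample))
                    printf("timeout %u us\n", LIRC_VALUE(sample));
    }

Per the comment added in lirc_dev.c, decoders that predate
LIRC_MODE2_OVERFLOW (such as lircd) interpret the 0xffffff payload as a long
space and reset their state, which is the compatibility behavior this patch
relies on.
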
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/lirc.h.rst.exceptions | 3 +++ Documentation/userspace-api/media/rc/lirc-dev-intro.rst | 11 +++++++++-- drivers/media/rc/lirc_dev.c | 13 ++++++------- drivers/media/rc/rc-loopback.c | 6 +++++- include/uapi/linux/lirc.h | 11 +++++++---- 5 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/lirc.h.rst.exceptions b/Documentation/userspace-api/media/lirc.h.rst.exceptions index ec86e82d026d..5f31e967bc50 100644 --- a/Documentation/userspace-api/media/lirc.h.rst.exceptions +++ b/Documentation/userspace-api/media/lirc.h.rst.exceptions @@ -11,12 +11,14 @@ ignore define LIRC_SPACE ignore define LIRC_PULSE ignore define LIRC_FREQUENCY ignore define LIRC_TIMEOUT +ignore define LIRC_OVERFLOW ignore define LIRC_VALUE ignore define LIRC_MODE2 ignore define LIRC_IS_SPACE ignore define LIRC_IS_PULSE ignore define LIRC_IS_FREQUENCY ignore define LIRC_IS_TIMEOUT +ignore define LIRC_IS_OVERFLOW ignore define LIRC_MODE2SEND ignore define LIRC_SEND2MODE @@ -75,6 +77,7 @@ ignore define PULSE_MASK ignore define LIRC_MODE2_SPACE ignore define LIRC_MODE2_PULSE ignore define LIRC_MODE2_TIMEOUT +ignore define LIRC_MODE2_OVERFLOW ignore define LIRC_VALUE_MASK ignore define LIRC_MODE2_MASK diff --git a/Documentation/userspace-api/media/rc/lirc-dev-intro.rst b/Documentation/userspace-api/media/rc/lirc-dev-intro.rst index 9a5e5f0aae11..d899331b943f 100644 --- a/Documentation/userspace-api/media/rc/lirc-dev-intro.rst +++ b/Documentation/userspace-api/media/rc/lirc-dev-intro.rst @@ -103,11 +103,11 @@ on the following table. ``LIRC_MODE2_PULSE`` - Signifies the presence of IR in microseconds. + Signifies the presence of IR in microseconds, also known as *flash*. ``LIRC_MODE2_SPACE`` - Signifies absence of IR in microseconds. + Signifies absence of IR in microseconds, also known as *gap*. ``LIRC_MODE2_FREQUENCY`` @@ -121,6 +121,13 @@ on the following table. to no IR being detected, this packet will be sent, with the number of microseconds with no IR. + ``LIRC_MODE2_OVERFLOW`` + + Signifies that the IR receiver encounter an overflow, and some IR + is missing. The IR data after this should be correct again. The + actual value is not important, but this is set to 0xffffff by the + kernel for compatibility with lircd. + .. _lirc-mode-pulse: ``LIRC_MODE_PULSE`` diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index fa4671fc92be..765375bda0c6 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -44,14 +44,13 @@ void lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) /* Receiver overflow, data missing */ if (ev.overflow) { /* - * Userspace expects a long space event before the start of - * the signal to use as a sync. This may be done with repeat - * packets and normal samples. But if an overflow has been sent - * then we assume that a long time has passed, so we send a - * space with the maximum time value. + * Send lirc overflow message. This message is unknown to + * lircd, but it will interpret this as a long space as + * long as the value is set to high value. This resets its + * decoder state. 
*/ - sample = LIRC_SPACE(LIRC_VALUE_MASK); - dev_dbg(&dev->dev, "delivering overflow space to lirc_dev\n"); + sample = LIRC_OVERFLOW(LIRC_VALUE_MASK); + dev_dbg(&dev->dev, "delivering overflow to lirc_dev\n"); /* Carrier reports */ } else if (ev.carrier_report) { diff --git a/drivers/media/rc/rc-loopback.c b/drivers/media/rc/rc-loopback.c index 6441879fcba1..b356041c5c00 100644 --- a/drivers/media/rc/rc-loopback.c +++ b/drivers/media/rc/rc-loopback.c @@ -112,7 +112,11 @@ static int loop_tx_ir(struct rc_dev *dev, unsigned *txbuf, unsigned count) rawir.pulse = i % 2 ? false : true; rawir.duration = txbuf[i]; - ir_raw_event_store_with_filter(dev, &rawir); + /* simulate overflow if ridiculously long pulse was sent */ + if (rawir.pulse && rawir.duration > MS_TO_US(50)) + ir_raw_event_overflow(dev); + else + ir_raw_event_store_with_filter(dev, &rawir); } if (lodev->carrierreport) { diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index a1f9c26ea537..21c69a6a100d 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -16,14 +16,16 @@ #define LIRC_MODE2_PULSE 0x01000000 #define LIRC_MODE2_FREQUENCY 0x02000000 #define LIRC_MODE2_TIMEOUT 0x03000000 +#define LIRC_MODE2_OVERFLOW 0x04000000 #define LIRC_VALUE_MASK 0x00FFFFFF #define LIRC_MODE2_MASK 0xFF000000 -#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE) -#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE) -#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY) -#define LIRC_TIMEOUT(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT) +#define LIRC_SPACE(val) (((val) & LIRC_VALUE_MASK) | LIRC_MODE2_SPACE) +#define LIRC_PULSE(val) (((val) & LIRC_VALUE_MASK) | LIRC_MODE2_PULSE) +#define LIRC_FREQUENCY(val) (((val) & LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY) +#define LIRC_TIMEOUT(val) (((val) & LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT) +#define LIRC_OVERFLOW(val) (((val) & LIRC_VALUE_MASK) | LIRC_MODE2_OVERFLOW) #define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK) #define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK) @@ -32,6 +34,7 @@ #define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE) #define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY) #define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT) +#define LIRC_IS_OVERFLOW(val) (LIRC_MODE2(val) == LIRC_MODE2_OVERFLOW) /* used heavily by lirc userspace */ #define lirc_t int -- cgit v1.2.3 From f6c6804c43fa18d3cee64b55490dfbd3bef1363a Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 28 Jan 2022 15:40:25 +0000 Subject: kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h This way we can more easily find the next free IOCTL number when adding new IOCTLs. 
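For reference, a hedged sketch of the pattern the relocated define
participates in. Per the KVM API documentation,
KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) reports the buffer size that
KVM_GET_XSAVE2 needs, never smaller than sizeof(struct kvm_xsave); the
function name below is illustrative:

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <stdlib.h>

    /* Fetch a vCPU's expanded xstate buffer. */
    static struct kvm_xsave *get_xsave2(int vm_fd, int vcpu_fd)
    {
            int size = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
            struct kvm_xsave *buf;

            if (size < (int)sizeof(struct kvm_xsave))
                    size = sizeof(struct kvm_xsave);

            buf = malloc(size);
            if (buf && ioctl(vcpu_fd, KVM_GET_XSAVE2, buf)) {
                    free(buf);
                    return NULL;
            }
            return buf;
    }
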
Fixes: be50b2065dfa ("kvm: x86: Add support for getting/setting expanded xstate buffer") Signed-off-by: Janosch Frank Message-Id: <20220128154025.102666-1-frankja@linux.ibm.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b46bcdb0cab1..5191b57e1562 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1624,9 +1624,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -2048,4 +2045,7 @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From e187013abeb4c2a7ec8a4bb978844c7e92a1a6ec Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:21 +0300 Subject: txhash: Make rethinking txhash behavior configurable via sysctl Add a per ns sysctl that controls the txhash rethink behavior: net.core.txrehash. When enabled, the same behavior is retained, when disabled, rethink is not performed. Sysctl is enabled by default. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/core.h | 1 + include/net/sock.h | 34 +++++++++++++++++++++------------- include/uapi/linux/socket.h | 3 +++ net/core/net_namespace.c | 2 ++ net/core/sysctl_net_core.c | 14 ++++++++++++-- 5 files changed, 39 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 552bc25b1933..388244e315e7 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -10,6 +10,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..0540e1b2aeb0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -587,6 +587,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) __tmp | SK_USER_DATA_NOCOPY); \ }) +static inline +struct net *sock_net(const struct sock *sk) +{ + return read_pnet(&sk->sk_net); +} + +static inline +void sock_net_set(struct sock *sk, struct net *net) +{ + write_pnet(&sk->sk_net, net); +} + /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. 
SK_FORCE_REUSE @@ -2054,10 +2066,18 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) { + u8 rehash; + + if (!sk->sk_txhash) + return false; + + rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + + if (rehash) { sk_set_txhash(sk); return true; } + return false; } @@ -2704,18 +2724,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static inline -struct net *sock_net(const struct sock *sk) -{ - return read_pnet(&sk->sk_net); -} - -static inline -void sock_net_set(struct sock *sk, struct net *net) -{ - write_pnet(&sk->sk_net, net); -} - static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index eb0a9a5b6e71..0accd6102ece 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,4 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DISABLED 0 +#define SOCK_TXREHASH_ENABLED 1 + #endif /* _UAPI_LINUX_SOCKET_H */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c53d9aab38ab..8711350085d6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -364,6 +364,8 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7b4d485aac7a..dbeb8ecbcd98 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, { } }; @@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl; + struct ctl_table *tbl, *tmp; tbl = netns_core_table; if (!net_eq(net, &init_net)) { @@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (tmp = tbl; tmp->procname; tmp++) + tmp->data += (char *)net - (char *)&init_net; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { -- cgit v1.2.3 From 26859240e4ee701e0379f08634957adaff67e43a Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:22 +0300 Subject: txhash: Add socket option to control TX hash rethink behavior Add the SO_TXREHASH socket option to control hash rethink behavior per socket. When default mode is set, sockets disable rehash at initialization and use sysctl option when entering listen state. setsockopt() overrides default behavior. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 12 +++--------- include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/socket.h | 1 + net/core/sock.c | 13 +++++++++++++ net/ipv4/inet_connection_sock.c | 3 +++ 9 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 284d28755b8d..7d81535893af 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -133,6 +133,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 24e0efb360f6..1d55e57b8466 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -144,6 +144,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 845ddc63c882..654061e0964e 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -125,6 +125,8 @@ #define SO_RESERVE_MEM 0x4047 +#define SO_TXREHASH 0x4048 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 2672dd03faf3..666f81e617ea 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -126,6 +126,8 @@ #define SO_RESERVE_MEM 0x0052 +#define SO_TXREHASH 0x0053 + #if !defined(__KERNEL__) diff --git a/include/net/sock.h b/include/net/sock.h index 0540e1b2aeb0..d6c13f0fba40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -316,6 +316,7 @@ struct sk_filter; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit + * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received @@ -491,6 +492,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + u8 sk_txrehash; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; @@ -2066,18 +2068,10 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - u8 rehash; - - if (!sk->sk_txhash) - return false; - - rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - - if (rehash) { + if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } - return false; } diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index c77a1313b3b0..467ca2f28760 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -128,6 +128,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index 0accd6102ece..51d6bb2f6765 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,6 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DEFAULT ((u8)-1) 
#define SOCK_TXREHASH_DISABLED 0 #define SOCK_TXREHASH_ENABLED 1 diff --git a/net/core/sock.c b/net/core/sock.c index cccf21f3618d..5e711b42898f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1447,6 +1447,14 @@ set_sndbuf: break; } + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + sk->sk_txrehash = (u8)val; + break; + default: ret = -ENOPROTOOPT; break; @@ -1834,6 +1842,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_reserved_mem; break; + case SO_TXREHASH: + v.val = sk->sk_txrehash; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -3279,6 +3291,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; + sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc2a985f6064..b81fb13fc5f4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1046,6 +1046,9 @@ int inet_csk_listen_start(struct sock *sk) sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); + if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) + sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only -- cgit v1.2.3 From 4421a582718ab81608d8486734c18083b822390d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 30 Jan 2022 12:55:17 +0100 Subject: bpf: Make dst_port field in struct bpf_sock 16-bit wide Menglong Dong reports that the documentation for the dst_port field in struct bpf_sock is inaccurate and confusing. From the BPF program PoV, the field is a zero-padded 16-bit integer in network byte order. The value appears to the BPF user as if laid out in memory as so: offsetof(struct bpf_sock, dst_port) + 0 + 8 +16 0x00 +24 0x00 32-, 16-, and 8-bit wide loads from the field are all allowed, but only if the offset into the field is 0. 32-bit wide loads from dst_port are especially confusing. The loaded value, after converting to host byte order with bpf_ntohl(dst_port), contains the port number in the upper 16-bits. Remove the confusion by splitting the field into two 16-bit fields. For backward compatibility, allow 32-bit wide loads from offsetof(struct bpf_sock, dst_port). While at it, allow loads 8-bit loads at offset [0] and [1] from dst_port. 
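A sketch of what the two layouts look like from BPF program C (types per the
patched UAPI; bpf_ntohs()/bpf_ntohl() come from libbpf's bpf_endian.h;
function names are illustrative):

    /* sk is a context pointer to struct bpf_sock, e.g. in a
     * BPF_PROG_TYPE_CGROUP_SOCK program.
     */
    static __always_inline int dst_is_https(struct bpf_sock *sk)
    {
            __be16 dport = sk->dst_port;    /* natural 16-bit load */

            return bpf_ntohs(dport) == 443;
    }

    /* The legacy 32-bit load stays valid for backward compatibility;
     * after byte swapping, the port sits in the upper 16 bits:
     */
    static __always_inline __u16 dst_port_legacy(struct bpf_sock *sk)
    {
            __u32 raw = *(__u32 *)&sk->dst_port;

            return bpf_ntohl(raw) >> 16;
    }
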
Reported-by: Menglong Dong Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220130115518.213259-2-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a2f7041ebae..a7f0ddedac1f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5574,7 +5574,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; diff --git a/net/core/filter.c b/net/core/filter.c index a06931c27eeb..99a05199a806 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8265,6 +8265,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -8276,7 +8277,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -8285,6 +8285,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; -- cgit v1.2.3 From bfdf4e6208051ed7165b2e92035b4bf11f43eb63 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 27 Jan 2022 10:27:20 -0500 Subject: rseq: Remove broken uapi field layout on 32-bit little endian The rseq rseq_cs.ptr.{ptr32,padding} uapi endianness handling is entirely wrong on 32-bit little endian: a preprocessor logic mistake wrongly uses the big endian field layout on 32-bit little endian architectures. Fortunately, those ptr32 accessors were never used within the kernel, and only meant as a convenience for user-space. Remove those and replace the whole rseq_cs union by a __u64 type, as this is the only thing really needed to express the ABI. Document how 32-bit architectures are meant to interact with this field. 
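For illustration, a hedged userspace sketch of what this means in practice; the helper name is hypothetical, and the pointer-at-offset-0 layout assumes little endian (on 32-bit big endian the low-order bits sit at offset 4, which is exactly the subtlety the removed union encoded):

#include <stdint.h>
#include <linux/rseq.h>

/* Publish a critical-section descriptor from 32-bit user space: write
 * only the low-order 32 bits of the __u64 rseq_cs field; the high-order
 * bits stay zero from registration time.
 */
static void set_rseq_cs(struct rseq *rs, const struct rseq_cs *cs)
{
	uint32_t *lo = (uint32_t *)&rs->rseq_cs;	/* little endian assumed */

	__atomic_store_n(lo, (uint32_t)(uintptr_t)cs, __ATOMIC_RELAXED);
}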
Fixes: ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union, update includes") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220127152720.25898-1-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 20 ++++---------------- kernel/rseq.c | 8 ++++---- 2 files changed, 8 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 9a402fdb60e9..77ee207623a9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -105,23 +105,11 @@ struct rseq { * Read and set by the kernel. Set by user-space with single-copy * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. + * + * 32-bit architectures should update the low order bits of the + * rseq_cs field, leaving the high order bits initialized to 0. */ - union { - __u64 ptr64; -#ifdef __LP64__ - __u64 ptr; -#else - struct { -#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) - __u32 padding; /* Initialized to zero. */ - __u32 ptr32; -#else /* LITTLE */ - __u32 ptr32; - __u32 padding; /* Initialized to zero. */ -#endif /* ENDIAN */ - } ptr; -#endif - } rseq_cs; + __u64 rseq_cs; /* * Restartable sequences flags field. diff --git a/kernel/rseq.c b/kernel/rseq.c index 6d45ac3dae7f..97ac20b4f738 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) int ret; #ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs.ptr64)) + if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else - if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { @@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs.ptr64); + return put_user(0UL, &t->rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif -- cgit v1.2.3 From ddecd22878601a606d160680fa85802b75d92eb6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:07 +0100 Subject: perf: uapi: Document perf_event_attr::sig_data truncation on 32 bit architectures Due to the alignment requirements of siginfo_t, as described in 3ddb3fd8cdb0 ("signal, perf: Fix siginfo_t by avoiding u64 on 32-bit architectures"), siginfo_t::si_perf_data is limited to an unsigned long. However, perf_event_attr::sig_data is an u64, to avoid having to deal with compat conversions. Due to being an u64, it may not immediately be clear to users that sig_data is truncated on 32 bit architectures. Add a comment to explicitly point this out, and hopefully help some users save time by not having to deduce themselves what's happening. 
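A hedged consumer-side sketch (handler body hypothetical; field spelling per recent siginfo_t):

#include <signal.h>

/* SIGTRAP handler receiving the event cookie: si_perf_data is
 * long-sized, so on a 32-bit architecture only the low 32 bits of the
 * u64 perf_event_attr::sig_data survive the round trip.
 */
static void on_sigtrap(int sig, siginfo_t *info, void *ucontext)
{
	unsigned long data = info->si_perf_data;	/* truncated on 32 bit */

	(void)data;	/* e.g. demultiplex which event fired */
}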
Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-3-elver@google.com --- include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1b65042ab1db..82858b697c05 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -465,6 +465,8 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via * siginfo_t::si_perf_data, e.g. to permit user to identify the event. + * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. */ __u64 sig_data; }; -- cgit v1.2.3 From c86d86131ab75696fc52d98571148842e067d620 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 2 Feb 2022 06:09:04 +0300 Subject: Partially revert "net/smc: Add netlink net namespace support" The change of sizeof(struct smc_diag_linkinfo) by commit 79d39fc503b4 ("net/smc: Add netlink net namespace support") introduced an ABI regression: since struct smc_diag_lgrinfo contains an object of type "struct smc_diag_linkinfo", offset of all subsequent members of struct smc_diag_lgrinfo was changed by that change. As result, applications compiled with the old version of struct smc_diag_linkinfo will receive garbage in struct smc_diag_lgrinfo.role if the kernel implements this new version of struct smc_diag_linkinfo. Fix this regression by reverting the part of commit 79d39fc503b4 that changes struct smc_diag_linkinfo. After all, there is SMC_GEN_NETLINK interface which is good enough, so there is probably no need to touch the smc_diag ABI in the first place. Fixes: 79d39fc503b4 ("net/smc: Add netlink net namespace support") Signed-off-by: Dmitry V. 
Levin Reviewed-by: Karsten Graul Link: https://lore.kernel.org/r/20220202030904.GA9742@altlinux.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/smc_diag.h | 11 +++++------ net/smc/smc_diag.c | 2 -- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index c7008d87f1a4..8cb3a6fef553 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -84,12 +84,11 @@ struct smc_diag_conninfo { /* SMC_DIAG_LINKINFO */ struct smc_diag_linkinfo { - __u8 link_id; /* link identifier */ - __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ - __u8 ibport; /* RDMA device port number */ - __u8 gid[40]; /* local GID */ - __u8 peer_gid[40]; /* peer GID */ - __aligned_u64 net_cookie; /* RDMA device net namespace */ + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ }; struct smc_diag_lgrinfo { diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index b8898c787d23..1fca2f90a9c7 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -146,13 +146,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; - struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net); struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, - .lnk[0].net_cookie = net->net_cookie, }; memcpy(linfo.lnk[0].ibname, -- cgit v1.2.3 From e4b1eb24ce5a696ef7229f9926ff34d7502f0582 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 27 Jan 2022 11:34:53 -0800 Subject: thermal: netlink: Add a new event to notify CPU capabilities change Add a new netlink event to notify change in CPU capabilities in terms of performance and efficiency. Firmware may change CPU capabilities as a result of thermal events in the system or to account for changes in the TDP (thermal design power) level. This notification type will allow user space to avoid running workloads on certain CPUs or proactively adjust power limits to avoid future events. The netlink message consists of a nested attribute (THERMAL_GENL_ATTR_CPU_CAPABILITY) with three attributes: * THERMAL_GENL_ATTR_CPU_CAPABILITY_ID (type u32): -- logical CPU number * THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE (type u32): -- Scaled performance from 0-1023 * THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY (type u32): -- Scaled efficiency from 0-1023 Reviewed-by: Len Brown Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- drivers/thermal/thermal_netlink.c | 53 +++++++++++++++++++++++++++++++++++++++ drivers/thermal/thermal_netlink.h | 14 +++++++++++ include/uapi/linux/thermal.h | 6 ++++- 3 files changed, 72 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index a16dd4d5d710..7c97a091680e 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -43,6 +43,11 @@ static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] = [THERMAL_GENL_ATTR_CDEV_MAX_STATE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CDEV_NAME] = { .type = NLA_STRING, .len = THERMAL_NAME_LENGTH }, + /* CPU capabilities */ + [THERMAL_GENL_ATTR_CPU_CAPABILITY] = { .type = NLA_NESTED }, + [THERMAL_GENL_ATTR_CPU_CAPABILITY_ID] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE] = { .type = NLA_U32 }, + [THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY] = { .type = NLA_U32 }, }; struct param { @@ -58,6 +63,8 @@ struct param { int temp; int cdev_state; int cdev_max_state; + struct thermal_genl_cpu_caps *cpu_capabilities; + int cpu_capabilities_count; }; typedef int (*cb_t)(struct param *); @@ -190,6 +197,42 @@ static int thermal_genl_event_gov_change(struct param *p) return 0; } +static int thermal_genl_event_cpu_capability_change(struct param *p) +{ + struct thermal_genl_cpu_caps *cpu_cap = p->cpu_capabilities; + struct sk_buff *msg = p->msg; + struct nlattr *start_cap; + int i; + + start_cap = nla_nest_start(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY); + if (!start_cap) + return -EMSGSIZE; + + for (i = 0; i < p->cpu_capabilities_count; ++i) { + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_ID, + cpu_cap->cpu)) + goto out_cancel_nest; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE, + cpu_cap->performance)) + goto out_cancel_nest; + + if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY, + cpu_cap->efficiency)) + goto out_cancel_nest; + + ++cpu_cap; + } + + nla_nest_end(msg, start_cap); + + return 0; +out_cancel_nest: + nla_nest_cancel(msg, start_cap); + + return -EMSGSIZE; +} + int thermal_genl_event_tz_delete(struct param *p) __attribute__((alias("thermal_genl_event_tz"))); @@ -219,6 +262,7 @@ static cb_t event_cb[] = { [THERMAL_GENL_EVENT_CDEV_DELETE] = thermal_genl_event_cdev_delete, [THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = thermal_genl_event_cdev_state_update, [THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = thermal_genl_event_gov_change, + [THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change, }; /* @@ -356,6 +400,15 @@ int thermal_notify_tz_gov_change(int tz_id, const char *name) return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p); } +int thermal_genl_cpu_capability_event(int count, + struct thermal_genl_cpu_caps *caps) +{ + struct param p = { .cpu_capabilities_count = count, .cpu_capabilities = caps }; + + return thermal_genl_send_event(THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, &p); +} +EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event); + /*************************** Command encoding ********************************/ static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz, diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h index e554f76291f4..04d1adbbc012 100644 --- a/drivers/thermal/thermal_netlink.h +++ b/drivers/thermal/thermal_netlink.h @@ -4,6 +4,12 @@ * Author: Daniel Lezcano */ +struct thermal_genl_cpu_caps { + int cpu; + int performance; + 
int efficiency; +}; + /* Netlink notification function */ #ifdef CONFIG_THERMAL_NETLINK int __init thermal_netlink_init(void); @@ -23,6 +29,8 @@ int thermal_notify_cdev_add(int cdev_id, const char *name, int max_state); int thermal_notify_cdev_delete(int cdev_id); int thermal_notify_tz_gov_change(int tz_id, const char *name); int thermal_genl_sampling_temp(int id, int temp); +int thermal_genl_cpu_capability_event(int count, + struct thermal_genl_cpu_caps *caps); #else static inline int thermal_netlink_init(void) { @@ -101,4 +109,10 @@ static inline int thermal_genl_sampling_temp(int id, int temp) { return 0; } + +static inline int thermal_genl_cpu_capability_event(int count, struct cpu_capability *caps) +{ + return 0; +} + #endif /* CONFIG_THERMAL_NETLINK */ diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 9aa2fedfa309..fc78bf3aead7 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -44,7 +44,10 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_CDEV_MAX_STATE, THERMAL_GENL_ATTR_CDEV_NAME, THERMAL_GENL_ATTR_GOV_NAME, - + THERMAL_GENL_ATTR_CPU_CAPABILITY, + THERMAL_GENL_ATTR_CPU_CAPABILITY_ID, + THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE, + THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY, __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) @@ -71,6 +74,7 @@ enum thermal_genl_event { THERMAL_GENL_EVENT_CDEV_DELETE, /* Cdev unbound */ THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, /* Cdev state updated */ THERMAL_GENL_EVENT_TZ_GOV_CHANGE, /* Governor policy changed */ + THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, /* CPU capability changed */ __THERMAL_GENL_EVENT_MAX, }; #define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1) -- cgit v1.2.3 From d1ca60efc53d665cf89ed847a14a510a81770b81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 2 Feb 2022 12:00:56 +0100 Subject: netfilter: ctnetlink: disable helper autoassign When userspace, e.g. conntrackd, inserts an entry with a specified helper, its possible that the helper is lost immediately after its added: ctnetlink_create_conntrack -> nf_ct_helper_ext_add + assign helper -> ctnetlink_setup_nat -> ctnetlink_parse_nat_setup -> parse_nat_setup -> nfnetlink_parse_nat_setup -> nf_nat_setup_info -> nf_conntrack_alter_reply -> __nf_ct_try_assign_helper ... and __nf_ct_try_assign_helper will zero the helper again. Set IPS_HELPER bit to bypass auto-assign logic, its unwanted, just like when helper is assigned via ruleset. Dropped old 'not strictly necessary' comment, it referred to use of rcu_assign_pointer() before it got replaced by RCU_INIT_POINTER(). NB: Fixes tag intentionally incorrect, this extends the referenced commit, but this change won't build without IPS_HELPER introduced there. 
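For context, a hedged userspace sketch of the kind of insertion that used to lose its helper (libnetfilter_conntrack; treat the exact calls as illustrative):

#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

/* Create a conntrack entry with an explicit helper, as conntrackd does.
 * With this patch the kernel sets IPS_HELPER on such entries, so the
 * later NAT setup path no longer zeroes the assignment.
 */
static int create_with_helper(struct nfct_handle *h, struct nf_conntrack *ct)
{
	nfct_set_attr(ct, ATTR_HELPER_NAME, "ftp");	/* explicit helper */
	return nfct_query(h, NFCT_Q_CREATE, ct);	/* insert the entry */
}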
Fixes: 6714cf5465d280 ("netfilter: nf_conntrack: fix explicit helper attachment and NAT") Reported-by: Pham Thanh Tuyen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +- net/netfilter/nf_conntrack_netlink.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 4b3395082d15..26071021e986 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -106,7 +106,7 @@ enum ip_conntrack_status { IPS_NAT_CLASH = IPS_UNTRACKED, #endif - /* Conntrack got a helper explicitly attached via CT target. */ + /* Conntrack got a helper explicitly attached (ruleset, ctnetlink). */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ac438370f94a..7032402ffd33 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2311,7 +2311,8 @@ ctnetlink_create_conntrack(struct net *net, if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - /* not in hash table yet so not strictly necessary */ + /* disable helper auto-assignment for this entry */ + ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } } else { -- cgit v1.2.3 From 8b5413647262dda8d8d0e07e14ea1de9ac7cf0b2 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 17 Jan 2022 21:56:13 +0100 Subject: netfilter: nfqueue: enable to get skb->priority This info could be useful to improve traffic analysis. Signed-off-by: Nicolas Dichtel Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_queue.h | 1 + net/netfilter/nfnetlink_queue.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index aed90c4df0c8..ef7c97f21a15 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -61,6 +61,7 @@ enum nfqnl_attr_type { NFQA_SECCTX, /* security context string */ NFQA_VLAN, /* nested attribute: packet vlan info */ NFQA_L2HDR, /* full L2 header */ + NFQA_PRIORITY, /* skb->priority */ __NFQA_MAX }; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ea2d9c2a44cf..48d7a59c6482 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -402,6 +402,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(u_int32_t)) /* priority */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + nla_total_size(sizeof(u_int32_t)); /* cap_len */ @@ -559,6 +560,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) goto nla_put_failure; + if (entskb->priority && + nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority))) + goto nla_put_failure; + if (indev && entskb->dev && skb_mac_header_was_set(entskb) && skb_mac_header_len(entskb) != 0) { -- cgit v1.2.3 From be847673cfffce8bb6e9ed6ae186081280c58831 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Wed, 2 Feb 2022 
15:25:53 +0100 Subject: uapi: ioam: Insertion frequency Add the insertion frequency uapi for IOAM lwtunnels. Signed-off-by: Justin Iurman Signed-off-by: Jakub Kicinski --- include/uapi/linux/ioam6_iptunnel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ioam6_iptunnel.h b/include/uapi/linux/ioam6_iptunnel.h index 829ffdfcacca..38f6a8fdfd34 100644 --- a/include/uapi/linux/ioam6_iptunnel.h +++ b/include/uapi/linux/ioam6_iptunnel.h @@ -41,6 +41,15 @@ enum { /* IOAM Trace Header */ IOAM6_IPTUNNEL_TRACE, /* struct ioam6_trace_hdr */ + /* Insertion frequency: + * "k over n" packets (0 < k <= n) + * [0.0001% ... 100%] + */ +#define IOAM6_IPTUNNEL_FREQ_MIN 1 +#define IOAM6_IPTUNNEL_FREQ_MAX 1000000 + IOAM6_IPTUNNEL_FREQ_K, /* u32 */ + IOAM6_IPTUNNEL_FREQ_N, /* u32 */ + __IOAM6_IPTUNNEL_MAX, }; -- cgit v1.2.3 From 3698807094ecae945436921325f5c309d1123f11 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Tue, 24 Aug 2021 16:13:41 -0400 Subject: drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs Checkpoint-Restore in userspace (CRIU) is a powerful tool that can snapshot a running process and later restore it on the same or a remote machine, but it expects processes that have a device file (e.g. a GPU) associated with them to provide the necessary driver support to assist CRIU and its extensible plugin interface. Thus, in order to support checkpoint-restore of any ROCm process, the AMD Radeon Open Compute kernel driver needs to provide a set of new APIs that provide the necessary VRAM metadata and its contents to a userspace component (the CRIU plugin) that can store it in the form of image files. This introduces some new ioctls which will be used to checkpoint-restore any KFD-bound user process. KFD only allows ioctl calls from the same process that opened the KFD file descriptor. Since these ioctls are expected to be called from a KFD CRIU plugin, which has elevated ptrace-attach privileges and the CAP_CHECKPOINT_RESTORE capability attached with its file descriptors, modify KFD to allow such calls.
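As a sketch of the checkpoint-side call sequence (hypothetical plugin fragment; op and struct names come from the uapi added below, buffer management elided):

#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* PROCESS_INFO pauses the target and evicts its queues, CHECKPOINT dumps
 * the contents, UNPAUSE lets the queues run again.
 */
static int kfd_checkpoint(int kfd_fd, struct kfd_ioctl_criu_args *args)
{
	args->op = KFD_CRIU_OP_PROCESS_INFO;
	if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, args))
		return -1;
	/* ...allocate bos/priv_data buffers from the returned sizes... */
	args->op = KFD_CRIU_OP_CHECKPOINT;
	if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, args))
		return -1;
	args->op = KFD_CRIU_OP_UNPAUSE;
	return ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, args);
}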
(API redesigned by David Yat Sin) Suggested-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: David Yat Sin Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 98 +++++++++++++++++++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 65 ++++++++++++++++++++- include/uapi/linux/kfd_ioctl.h | 81 +++++++++++++++++++++++++- 3 files changed, 241 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 214a2c67fba4..90e6d9e335a5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "kfd_priv.h" @@ -1859,6 +1860,75 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) } #endif +static int criu_checkpoint(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_restore(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_unpause(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_resume(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_process_info(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_ioctl_criu_args *args = data; + int ret; + + dev_dbg(kfd_device, "CRIU operation: %d\n", args->op); + switch (args->op) { + case KFD_CRIU_OP_PROCESS_INFO: + ret = criu_process_info(filep, p, args); + break; + case KFD_CRIU_OP_CHECKPOINT: + ret = criu_checkpoint(filep, p, args); + break; + case KFD_CRIU_OP_UNPAUSE: + ret = criu_unpause(filep, p, args); + break; + case KFD_CRIU_OP_RESTORE: + ret = criu_restore(filep, p, args); + break; + case KFD_CRIU_OP_RESUME: + ret = criu_resume(filep, p, args); + break; + default: + dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op); + ret = -EINVAL; + break; + } + + if (ret) + dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret); + + return ret; +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1962,6 +2032,10 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE, kfd_ioctl_set_xnack_mode, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP, + kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE), + }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -1976,6 +2050,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) char *kdata = NULL; unsigned int usize, asize; int retcode = -EINVAL; + bool ptrace_attached = false; if (nr >= AMDKFD_CORE_IOCTL_COUNT) goto err_i1; @@ -2001,7 +2076,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) * processes need to create their own KFD device context. 
*/ process = filep->private_data; - if (process->lead_thread != current->group_leader) { + + rcu_read_lock(); + if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) && + ptrace_parent(process->lead_thread) == current) + ptrace_attached = true; + rcu_read_unlock(); + + if (process->lead_thread != current->group_leader + && !ptrace_attached) { dev_dbg(kfd_device, "Using KFD FD in wrong process\n"); retcode = -EBADF; goto err_i1; @@ -2016,6 +2099,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) goto err_i1; } + /* + * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support + * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a + * more priviledged access. + */ + if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) { + if (!capable(CAP_CHECKPOINT_RESTORE) && + !capable(CAP_SYS_ADMIN)) { + retcode = -EACCES; + goto err_i1; + } + } + if (cmd & (IOC_IN | IOC_OUT)) { if (asize <= sizeof(stack_kdata)) { kdata = stack_kdata; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ea68f3b3a4e9..f928878196ef 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -121,7 +121,26 @@ */ #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512 - +/** + * enum kfd_ioctl_flags - KFD ioctl flags + * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how + * userspace can use a given ioctl. + */ +enum kfd_ioctl_flags { + /* + * @KFD_IOC_FLAG_CHECKPOINT_RESTORE: + * Certain KFD ioctls such as AMDKFD_IOC_CRIU_OP can potentially + * perform privileged operations and load arbitrary data into MQDs and + * eventually HQD registers when the queue is mapped by HWS. In order to + * prevent this we should perform additional security checks. + * + * This is equivalent to callers with the CHECKPOINT_RESTORE capability. + * + * Note: Since earlier versions of docker do not support CHECKPOINT_RESTORE, + * we also allow ioctls with SYS_ADMIN capability. 
+ */ + KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0), +}; /* * Kernel module parameter to specify maximum number of supported queues per * device @@ -1006,6 +1025,50 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd, uint64_t tba_addr, uint64_t tma_addr); +/* CRIU */ +/* + * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private + * structures: + * kfd_criu_process_priv_data + * kfd_criu_device_priv_data + * kfd_criu_bo_priv_data + * kfd_criu_queue_priv_data + * kfd_criu_event_priv_data + * kfd_criu_svm_range_priv_data + */ + +#define KFD_CRIU_PRIV_VERSION 1 + +struct kfd_criu_process_priv_data { + uint32_t version; +}; + +struct kfd_criu_device_priv_data { + /* For future use */ + uint64_t reserved; +}; + +struct kfd_criu_bo_priv_data { + uint64_t reserved; +}; + +struct kfd_criu_svm_range_priv_data { + uint32_t object_type; + uint32_t reserved; +}; + +struct kfd_criu_queue_priv_data { + uint32_t object_type; + uint32_t reserved; +}; + +struct kfd_criu_event_priv_data { + uint32_t object_type; + uint32_t reserved; +}; + +/* CRIU - End */ + /* Queue Context Management */ int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index af96af174dc4..49429a6c42fc 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -468,6 +468,82 @@ struct kfd_ioctl_smi_events_args { __u32 anon_fd; /* from KFD */ }; +/************************************************************************************************** + * CRIU IOCTLs (Checkpoint Restore In Userspace) + * + * When checkpointing a process, the userspace application will perform: + * 1. PROCESS_INFO op to determine current process information. This pauses execution and evicts + * all the queues. + * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges) + * 3. UNPAUSE op to un-evict all the queues + * + * When restoring a process, the CRIU userspace application will perform: + * + * 1. RESTORE op to restore process contents + * 2. RESUME op to start the process + * + * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. User + * application needs to perform an UNPAUSE operation after calling PROCESS_INFO. + */ + +enum kfd_criu_op { + KFD_CRIU_OP_PROCESS_INFO, + KFD_CRIU_OP_CHECKPOINT, + KFD_CRIU_OP_UNPAUSE, + KFD_CRIU_OP_RESTORE, + KFD_CRIU_OP_RESUME, +}; + +/** + * kfd_ioctl_criu_args - Arguments perform CRIU operation + * @devices: [in/out] User pointer to memory location for devices information. + * This is an array of type kfd_criu_device_bucket. + * @bos: [in/out] User pointer to memory location for BOs information + * This is an array of type kfd_criu_bo_bucket. + * @priv_data: [in/out] User pointer to memory location for private data + * @priv_data_size: [in/out] Size of priv_data in bytes + * @num_devices: [in/out] Number of GPUs used by process. Size of @devices array. + * @num_bos [in/out] Number of BOs used by process. Size of @bos array. + * @num_objects: [in/out] Number of objects used by process. Objects are opaque to + * user application. 
+ * @pid: [in/out] PID of the process being checkpointed + * @op [in] Type of operation (kfd_criu_op) + * + * Return: 0 on success, -errno on failure + */ +struct kfd_ioctl_criu_args { + __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ + __u32 op; +}; + +struct kfd_criu_device_bucket { + __u32 user_gpu_id; + __u32 actual_gpu_id; + __u32 drm_fd; + __u32 pad; +}; + +struct kfd_criu_bo_bucket { + __u64 addr; + __u64 size; + __u64 offset; + __u64 restored_offset; /* During restore, updated offset for BO */ + __u32 gpu_id; /* This is the user_gpu_id */ + __u32 alloc_flags; + __u32 dmabuf_fd; + __u32 pad; +}; + +/* CRIU IOCTLs - END */ +/**************************************************************************************************/ + /* Register offset inside the remapped mmio page */ enum kfd_mmio_remap { @@ -742,7 +818,10 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_SET_XNACK_MODE \ AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) +#define AMDKFD_IOC_CRIU_OP \ + AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x22 +#define AMDKFD_COMMAND_END 0x23 #endif -- cgit v1.2.3 From 692996f2bef7aa1737e07554255ba0d9a73fb750 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Tue, 18 Jan 2022 01:47:58 -0500 Subject: drm/amdkfd: Bump up KFD API version for CRIU - Change KFD minor version to 7 for CRIU Proposed userspace changes: https://github.com/RadeonOpenCompute/criu Reviewed-by: Felix Kuehling Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 49429a6c42fc..e6a56c146920 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -32,9 +32,10 @@ * - 1.4 - Indicate new SRAM EDC bit in device properties * - 1.5 - Add SVM API * - 1.6 - Query clear flags in SVM get_attr API + * - 1.7 - Checkpoint Restore (CRIU) API */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 6 +#define KFD_IOCTL_MINOR_VERSION 7 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Feb 2022 16:14:32 +0200 Subject: xfrm: enforce validity of offload input flags struct xfrm_user_offload has flags variable that received user input, but kernel didn't check if valid bits were provided. It caused a situation where not sanitized input was forwarded directly to the drivers. For example, XFRM_OFFLOAD_IPV6 define that was exposed, was used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize input flags to forward XFRM_OFFLOAD_INBOUND to the drivers. 
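An illustrative userspace fragment (hypothetical; the surrounding netlink attribute plumbing is elided) of a request that still passes the new validation:

/* Only the two uapi-defined bits survive the check; any other flag in
 * xfrm_user_offload.flags now draws -EINVAL from xfrm_dev_state_add().
 */
struct xfrm_user_offload xuo = {
	.ifindex = ifindex,			/* offloading netdev */
	.flags = XFRM_OFFLOAD_INBOUND,		/* leave 0 for an outbound SA */
};
/* ...attached as the XFRMA_OFFLOAD_DEV attribute when adding the state... */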
Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4e29d7851890..65e13a099b1a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,6 +511,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. + * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3fa066419d37..39bce5d764de 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -223,6 +223,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -262,7 +265,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { -- cgit v1.2.3 From 63ed1aab3d40aa61aaa66819bdce9377ac7f40fa Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Wed, 9 Feb 2022 12:05:57 +0800 Subject: mctp: Add SIOCMCTP{ALLOC,DROP}TAG ioctls for tag control This change adds a couple of new ioctls for mctp sockets: SIOCMCTPALLOCTAG and SIOCMCTPDROPTAG. These ioctls provide facilities for explicit allocation / release of tags, overriding the automatic allocate-on-send/release-on-reply and timeout behaviours. This allows userspace more control over messages that may not fit a simple request/response model. In order to indicate a pre-allocated tag to the sendmsg() syscall, we introduce a new flag to the struct sockaddr_mctp.smctp_tag value: MCTP_TAG_PREALLOC. Additional changes from Jeremy Kerr . Contains a fix that was: Reported-by: kernel test robot Signed-off-by: Matt Johnston Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- Documentation/networking/mctp.rst | 48 ++++++++++ include/net/mctp.h | 11 ++- include/trace/events/mctp.h | 5 +- include/uapi/linux/mctp.h | 18 ++++ net/mctp/af_mctp.c | 189 ++++++++++++++++++++++++++++++++------ net/mctp/route.c | 114 +++++++++++++++++------ 6 files changed, 329 insertions(+), 56 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/mctp.rst b/Documentation/networking/mctp.rst index 46f74bffce0f..c628cb5406d2 100644 --- a/Documentation/networking/mctp.rst +++ b/Documentation/networking/mctp.rst @@ -212,6 +212,54 @@ remote address is already known, or the message does not require a reply. Like the send calls, sockets will only receive responses to requests they have sent (TO=1) and may only respond (TO=0) to requests they have received. 
+``ioctl(SIOCMCTPALLOCTAG)`` and ``ioctl(SIOCMCTPDROPTAG)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +These tags give applications more control over MCTP message tags, by allocating +(and dropping) tag values explicitly, rather than the kernel automatically +allocating a per-message tag at ``sendmsg()`` time. + +In general, you will only need to use these ioctls if your MCTP protocol does +not fit the usual request/response model. For example, if you need to persist +tags across multiple requests, or a request may generate more than one response. +In these cases, the ioctls allow you to decouple the tag allocation (and +release) from individual message send and receive operations. + +Both ioctls are passed a pointer to a ``struct mctp_ioc_tag_ctl``: + +.. code-block:: C + + struct mctp_ioc_tag_ctl { + mctp_eid_t peer_addr; + __u8 tag; + __u16 flags; + }; + +``SIOCMCTPALLOCTAG`` allocates a tag for a specific peer, which an application +can use in future ``sendmsg()`` calls. The application populates the +``peer_addr`` member with the remote EID. Other fields must be zero. + +On return, the ``tag`` member will be populated with the allocated tag value. +The allocated tag will have the following tag bits set: + + - ``MCTP_TAG_OWNER``: it only makes sense to allocate tags if you're the tag + owner + + - ``MCTP_TAG_PREALLOC``: to indicate to ``sendmsg()`` that this is a + preallocated tag. + + - ... and the actual tag value, within the least-significant three bits + (``MCTP_TAG_MASK``). Note that zero is a valid tag value. + +The tag value should be used as-is for the ``smctp_tag`` member of ``struct +sockaddr_mctp``. + +``SIOCMCTPDROPTAG`` releases a tag that has been previously allocated by a +``SIOCMCTPALLOCTAG`` ioctl. The ``peer_addr`` must be the same as used for the +allocation, and the ``tag`` value must match exactly the tag returned from the +allocation (including the ``MCTP_TAG_OWNER`` and ``MCTP_TAG_PREALLOC`` bits). +The ``flags`` field must be zero. + Kernel internals ================ diff --git a/include/net/mctp.h b/include/net/mctp.h index 706d329dd8e8..e80a4baf8379 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -126,7 +126,7 @@ struct mctp_sock { */ struct mctp_sk_key { mctp_eid_t peer_addr; - mctp_eid_t local_addr; + mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ /* we hold a ref to sk when set */ @@ -163,6 +163,12 @@ struct mctp_sk_key { */ unsigned long dev_flow_state; struct mctp_dev *dev; + + /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire + * automatically on timeout or response, instead SIOCMCTPDROPTAG + * is used. 
+ */ + bool manual_alloc; }; struct mctp_skb_cb { @@ -239,6 +245,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t daddr, mctp_eid_t saddr, + bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); diff --git a/include/trace/events/mctp.h b/include/trace/events/mctp.h index 175b057c507f..165cf25f77a7 100644 --- a/include/trace/events/mctp.h +++ b/include/trace/events/mctp.h @@ -15,6 +15,7 @@ enum { MCTP_TRACE_KEY_REPLIED, MCTP_TRACE_KEY_INVALIDATED, MCTP_TRACE_KEY_CLOSED, + MCTP_TRACE_KEY_DROPPED, }; #endif /* __TRACE_MCTP_ENUMS */ @@ -22,6 +23,7 @@ TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED); +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_DROPPED); TRACE_EVENT(mctp_key_acquire, TP_PROTO(const struct mctp_sk_key *key), @@ -66,7 +68,8 @@ TRACE_EVENT(mctp_key_release, { MCTP_TRACE_KEY_TIMEOUT, "timeout" }, { MCTP_TRACE_KEY_REPLIED, "replied" }, { MCTP_TRACE_KEY_INVALIDATED, "invalidated" }, - { MCTP_TRACE_KEY_CLOSED, "closed" }) + { MCTP_TRACE_KEY_CLOSED, "closed" }, + { MCTP_TRACE_KEY_DROPPED, "dropped" }) ) ); diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 07b0318716fc..154ab56651f1 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -44,7 +44,25 @@ struct sockaddr_mctp_ext { #define MCTP_TAG_MASK 0x07 #define MCTP_TAG_OWNER 0x08 +#define MCTP_TAG_PREALLOC 0x10 #define MCTP_OPT_ADDR_EXT 1 +#define SIOCMCTPALLOCTAG (SIOCPROTOPRIVATE + 0) +#define SIOCMCTPDROPTAG (SIOCPROTOPRIVATE + 1) + +struct mctp_ioc_tag_ctl { + mctp_eid_t peer_addr; + + /* For SIOCMCTPALLOCTAG: must be passed as zero, kernel will + * populate with the allocated tag value. Returned tag value will + * always have TO and PREALLOC set. + * + * For SIOCMCTPDROPTAG: userspace provides tag value to drop, from + * a prior SIOCMCTPALLOCTAG call (and so must have TO and PREALLOC set). + */ + __u8 tag; + __u16 flags; +}; + #endif /* __UAPI_MCTP_H */ diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index c921de63b494..f0702d920d8d 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -6,6 +6,7 @@ * Copyright (c) 2021 Google */ +#include #include #include #include @@ -21,6 +22,8 @@ /* socket implementation */ +static void mctp_sk_expire_keys(struct timer_list *timer); + static int mctp_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -99,13 +102,20 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; if (addr) { + const u8 tagbits = MCTP_TAG_MASK | MCTP_TAG_OWNER | + MCTP_TAG_PREALLOC; + if (addrlen < sizeof(struct sockaddr_mctp)) return -EINVAL; if (addr->smctp_family != AF_MCTP) return -EINVAL; if (!mctp_sockaddr_is_ok(addr)) return -EINVAL; - if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) + if (addr->smctp_tag & ~tagbits) + return -EINVAL; + /* can't preallocate a non-owned tag */ + if (addr->smctp_tag & MCTP_TAG_PREALLOC && + !(addr->smctp_tag & MCTP_TAG_OWNER)) return -EINVAL; } else { @@ -248,6 +258,32 @@ out_free: return rc; } +/* We're done with the key; invalidate, stop reassembly, and remove from lists. 
+ */ +static void __mctp_key_remove(struct mctp_sk_key *key, struct net *net, + unsigned long flags, unsigned long reason) +__releases(&key->lock) +__must_hold(&net->mctp.keys_lock) +{ + struct sk_buff *skb; + + trace_mctp_key_release(key, reason); + skb = key->reasm_head; + key->reasm_head = NULL; + key->reasm_dead = true; + key->valid = false; + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); + + hlist_del(&key->hlist); + hlist_del(&key->sklist); + + /* unref for the lists */ + mctp_key_unref(key); + + kfree_skb(skb); +} + static int mctp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -293,6 +329,115 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; } +static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_sk_key *key = NULL; + struct mctp_ioc_tag_ctl ctl; + unsigned long flags; + u8 tag; + + if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl))) + return -EFAULT; + + if (ctl.tag) + return -EINVAL; + + if (ctl.flags) + return -EINVAL; + + key = mctp_alloc_local_tag(msk, ctl.peer_addr, MCTP_ADDR_ANY, + true, &tag); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC; + if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) { + spin_lock_irqsave(&key->lock, flags); + __mctp_key_remove(key, net, flags, MCTP_TRACE_KEY_DROPPED); + mctp_key_unref(key); + return -EFAULT; + } + + mctp_key_unref(key); + return 0; +} + +static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_ioc_tag_ctl ctl; + unsigned long flags, fl2; + struct mctp_sk_key *key; + struct hlist_node *tmp; + int rc; + u8 tag; + + if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl))) + return -EFAULT; + + if (ctl.flags) + return -EINVAL; + + /* Must be a local tag, TO set, preallocated */ + if ((ctl.tag & ~MCTP_TAG_MASK) != (MCTP_TAG_OWNER | MCTP_TAG_PREALLOC)) + return -EINVAL; + + tag = ctl.tag & MCTP_TAG_MASK; + rc = -EINVAL; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + /* we do an irqsave here, even though we know the irq state, + * so we have the flags to pass to __mctp_key_remove + */ + spin_lock_irqsave(&key->lock, fl2); + if (key->manual_alloc && + ctl.peer_addr == key->peer_addr && + tag == key->tag) { + __mctp_key_remove(key, net, fl2, + MCTP_TRACE_KEY_DROPPED); + rc = 0; + } else { + spin_unlock_irqrestore(&key->lock, fl2); + } + } + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + return rc; +} + +static int mctp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + + switch (cmd) { + case SIOCMCTPALLOCTAG: + return mctp_ioctl_alloctag(msk, arg); + case SIOCMCTPDROPTAG: + return mctp_ioctl_droptag(msk, arg); + } + + return -EINVAL; +} + +#ifdef CONFIG_COMPAT +static int mctp_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + + switch (cmd) { + /* These have compatible ptr layouts */ + case SIOCMCTPALLOCTAG: + case SIOCMCTPDROPTAG: + return mctp_ioctl(sock, cmd, (unsigned long)argp); + } + + return -ENOIOCTLCMD; +} +#endif + static const struct proto_ops mctp_dgram_ops = { .family = PF_MCTP, .release = mctp_release, @@ -302,7 +447,7 @@ static const struct proto_ops 
mctp_dgram_ops = { .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, - .ioctl = sock_no_ioctl, + .ioctl = mctp_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -312,6 +457,9 @@ static const struct proto_ops mctp_dgram_ops = { .recvmsg = mctp_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_ioctl = mctp_compat_ioctl, +#endif }; static void mctp_sk_expire_keys(struct timer_list *timer) @@ -319,7 +467,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer) struct mctp_sock *msk = container_of(timer, struct mctp_sock, key_expiry); struct net *net = sock_net(&msk->sk); - unsigned long next_expiry, flags; + unsigned long next_expiry, flags, fl2; struct mctp_sk_key *key; struct hlist_node *tmp; bool next_expiry_valid = false; @@ -327,15 +475,16 @@ static void mctp_sk_expire_keys(struct timer_list *timer) spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { - spin_lock(&key->lock); + /* don't expire. manual_alloc is immutable, no locking + * required. + */ + if (key->manual_alloc) + continue; + spin_lock_irqsave(&key->lock, fl2); if (!time_after_eq(key->expiry, jiffies)) { - trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT); - key->valid = false; - hlist_del_rcu(&key->hlist); - hlist_del_rcu(&key->sklist); - spin_unlock(&key->lock); - mctp_key_unref(key); + __mctp_key_remove(key, net, fl2, + MCTP_TRACE_KEY_TIMEOUT); continue; } @@ -346,7 +495,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer) next_expiry = key->expiry; next_expiry_valid = true; } - spin_unlock(&key->lock); + spin_unlock_irqrestore(&key->lock, fl2); } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); @@ -387,9 +536,9 @@ static void mctp_sk_unhash(struct sock *sk) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct net *net = sock_net(sk); + unsigned long flags, fl2; struct mctp_sk_key *key; struct hlist_node *tmp; - unsigned long flags; /* remove from any type-based binds */ mutex_lock(&net->mctp.bind_lock); @@ -399,20 +548,8 @@ static void mctp_sk_unhash(struct sock *sk) /* remove tag allocations */ spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { - hlist_del(&key->sklist); - hlist_del(&key->hlist); - - trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED); - - spin_lock(&key->lock); - kfree_skb(key->reasm_head); - key->reasm_head = NULL; - key->reasm_dead = true; - key->valid = false; - spin_unlock(&key->lock); - - /* key is no longer on the lookup lists, unref */ - mctp_key_unref(key); + spin_lock_irqsave(&key->lock, fl2); + __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_CLOSED); } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); } diff --git a/net/mctp/route.c b/net/mctp/route.c index 35f72e99e188..17e3482aa770 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -203,29 +203,38 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) return rc; } -/* We're done with the key; unset valid and remove from lists. There may still - * be outstanding refs on the key though... +/* Helper for mctp_route_input(). + * We're done with the key; unlock and unref the key. + * For the usual case of automatic expiry we remove the key from lists. + * In the case that manual allocation is set on a key we release the lock + * and local ref, reset reassembly, but don't remove from lists. 
*/ -static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, - unsigned long flags) - __releases(&key->lock) +static void __mctp_key_done_in(struct mctp_sk_key *key, struct net *net, + unsigned long flags, unsigned long reason) +__releases(&key->lock) { struct sk_buff *skb; + trace_mctp_key_release(key, reason); skb = key->reasm_head; key->reasm_head = NULL; - key->reasm_dead = true; - key->valid = false; - mctp_dev_release_key(key->dev, key); + + if (!key->manual_alloc) { + key->reasm_dead = true; + key->valid = false; + mctp_dev_release_key(key->dev, key); + } spin_unlock_irqrestore(&key->lock, flags); - spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del(&key->hlist); - hlist_del(&key->sklist); - spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + if (!key->manual_alloc) { + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_del(&key->hlist); + hlist_del(&key->sklist); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - /* one unref for the lists */ - mctp_key_unref(key); + /* unref for the lists */ + mctp_key_unref(key); + } /* and one for the local reference */ mctp_key_unref(key); @@ -379,9 +388,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) /* we've hit a pending reassembly; not much we * can do but drop it */ - trace_mctp_key_release(key, - MCTP_TRACE_KEY_REPLIED); - __mctp_key_unlock_drop(key, net, f); + __mctp_key_done_in(key, net, f, + MCTP_TRACE_KEY_REPLIED); key = NULL; } rc = 0; @@ -423,9 +431,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) } else { if (key->reasm_head || key->reasm_dead) { /* duplicate start? drop everything */ - trace_mctp_key_release(key, - MCTP_TRACE_KEY_INVALIDATED); - __mctp_key_unlock_drop(key, net, f); + __mctp_key_done_in(key, net, f, + MCTP_TRACE_KEY_INVALIDATED); rc = -EEXIST; key = NULL; } else { @@ -448,10 +455,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * the reassembly/response key */ if (!rc && flags & MCTP_HDR_FLAG_EOM) { + msk = container_of(key->sk, struct mctp_sock, sk); sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; - trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED); - __mctp_key_unlock_drop(key, net, f); + __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); key = NULL; } @@ -579,9 +586,9 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, /* Allocate a locally-owned tag value for (saddr, daddr), and reserve * it for the socket msk */ -static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, - mctp_eid_t saddr, - mctp_eid_t daddr, u8 *tagp) +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t daddr, mctp_eid_t saddr, + bool manual, u8 *tagp) { struct net *net = sock_net(&msk->sk); struct netns_mctp *mns = &net->mctp; @@ -636,6 +643,7 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, mctp_reserve_tag(net, key, msk); trace_mctp_key_acquire(key); + key->manual_alloc = manual; *tagp = key->tag; } @@ -649,6 +657,50 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, return key; } +static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, + mctp_eid_t daddr, + u8 req_tag, u8 *tagp) +{ + struct net *net = sock_net(&msk->sk); + struct netns_mctp *mns = &net->mctp; + struct mctp_sk_key *key, *tmp; + unsigned long flags; + + req_tag &= ~(MCTP_TAG_PREALLOC | MCTP_TAG_OWNER); + key = NULL; + + spin_lock_irqsave(&mns->keys_lock, flags); + + 
hlist_for_each_entry(tmp, &mns->keys, hlist) { + if (tmp->tag != req_tag) + continue; + + if (!mctp_address_matches(tmp->peer_addr, daddr)) + continue; + + if (!tmp->manual_alloc) + continue; + + spin_lock(&tmp->lock); + if (tmp->valid) { + key = tmp; + refcount_inc(&key->refs); + spin_unlock(&tmp->lock); + break; + } + spin_unlock(&tmp->lock); + } + spin_unlock_irqrestore(&mns->keys_lock, flags); + + if (!key) + return ERR_PTR(-ENOENT); + + if (tagp) + *tagp = key->tag; + + return key; +} + /* routing lookups */ static bool mctp_rt_match_eid(struct mctp_route *rt, unsigned int net, mctp_eid_t eid) @@ -843,8 +895,14 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, if (rc) goto out_release; - if (req_tag & MCTP_HDR_FLAG_TO) { - key = mctp_alloc_local_tag(msk, saddr, daddr, &tag); + if (req_tag & MCTP_TAG_OWNER) { + if (req_tag & MCTP_TAG_PREALLOC) + key = mctp_lookup_prealloc_tag(msk, daddr, + req_tag, &tag); + else + key = mctp_alloc_local_tag(msk, daddr, saddr, + false, &tag); + if (IS_ERR(key)) { rc = PTR_ERR(key); goto out_release; @@ -855,7 +913,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, tag |= MCTP_HDR_FLAG_TO; } else { key = NULL; - tag = req_tag; + tag = req_tag & MCTP_TAG_MASK; } skb->protocol = htons(ETH_P_MCTP); -- cgit v1.2.3 From 9a69e2b385f443f244a7e8b8bcafe5ccfb0866b4 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 9 Feb 2022 19:43:32 +0100 Subject: bpf: Make remote_port field in struct bpf_sk_lookup 16-bit wide remote_port is another case of a BPF context field documented as a 32-bit value in network byte order for which the BPF context access converter generates a load of a zero-padded 16-bit integer in network byte order. First such case was dst_port in bpf_sock which got addressed in commit 4421a582718a ("bpf: Make dst_port field in struct bpf_sock 16-bit wide"). Loading 4-bytes from the remote_port offset and converting the value with bpf_ntohl() leads to surprising results, as the expected value is shifted by 16 bits. Reduce the confusion by splitting the field in two - a 16-bit field holding a big-endian integer, and a 16-bit zero-padding anonymous field that follows it. 
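A minimal sk_lookup sketch (hypothetical program, not part of this patch) showing the intended access pattern after the split:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* remote_port is now a __be16, so a 16-bit load plus bpf_ntohs() gives
 * the client's source port with no surprise shift from a zero-padded
 * 32-bit load.
 */
SEC("sk_lookup")
int select_sock(struct bpf_sk_lookup *ctx)
{
	if (bpf_ntohs(ctx->remote_port) == 7777)	/* illustrative policy */
		return SK_DROP;
	return SK_PASS;
}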
Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209184333.654927-2-jakub@cloudflare.com --- include/uapi/linux/bpf.h | 3 ++- net/bpf/test_run.c | 4 ++-- net/core/filter.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a7f0ddedac1f..afe3d0d7f5f2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6453,7 +6453,8 @@ struct bpf_sk_lookup { __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ __u32 remote_ip4; /* Network byte order */ __u32 remote_ip6[4]; /* Network byte order */ - __u32 remote_port; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ __u32 local_ip4; /* Network byte order */ __u32 local_ip6[4]; /* Network byte order */ __u32 local_port; /* Host byte order */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cb150f756f3d..f08034500813 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1147,7 +1147,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; - if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } @@ -1155,7 +1155,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; - ctx.sport = (__force __be16)user_ctx->remote_port; + ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: diff --git a/net/core/filter.c b/net/core/filter.c index 99a05199a806..83f06d3b2c52 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10843,7 +10843,8 @@ static bool sk_lookup_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): - case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case offsetof(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); -- cgit v1.2.3 From 5bdd3eb253544b1e80f904e1205699d0a126d2d6 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 4 Feb 2022 11:58:32 -0500 Subject: drm/amdkfd: Remove unused old debugger implementation Cleanup the kfd code by removing the unused old debugger implementation. The address watch was only ever implemented in the upstream driver for GFXv7 (Kaveri). The user mode tools runtime using this API was never open-sourced. Work on the old debugger prototype that used this API has been discontinued years ago. Only a small piece of resetting wavefronts is kept and is moved to kfd_device_queue_manager.c. 
Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 3 - .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 3 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 24 - .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 25 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 96 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 24 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 24 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 - drivers/gpu/drm/amd/amdkfd/Makefile | 2 - drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 290 +------ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 845 --------------------- drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 230 ------ drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 158 ---- drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h | 293 ------- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 - .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 59 ++ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 35 + drivers/gpu/drm/amd/amdkfd/kfd_iommu.c | 12 - drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 - drivers/gpu/drm/amd/amdkfd/kfd_process.c | 19 - drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 9 - include/uapi/linux/kfd_ioctl.h | 8 +- 22 files changed, 106 insertions(+), 2070 deletions(-) delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c index 46cd4ee6bafb..c8935d718207 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c @@ -37,10 +37,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = { .hqd_sdma_is_occupied = kgd_arcturus_hqd_sdma_is_occupied, .hqd_destroy = kgd_gfx_v9_hqd_destroy, .hqd_sdma_destroy = kgd_arcturus_hqd_sdma_destroy, - .address_watch_disable = kgd_gfx_v9_address_watch_disable, - .address_watch_execute = kgd_gfx_v9_address_watch_execute, .wave_control_execute = kgd_gfx_v9_wave_control_execute, - .address_watch_get_offset = kgd_gfx_v9_address_watch_get_offset, .get_atc_vmid_pasid_mapping_info = kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c index abe93b3ff765..4191af5a3f13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c @@ -289,10 +289,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = { .hqd_sdma_is_occupied = kgd_arcturus_hqd_sdma_is_occupied, .hqd_destroy = kgd_gfx_v9_hqd_destroy, .hqd_sdma_destroy = kgd_arcturus_hqd_sdma_destroy, - .address_watch_disable = kgd_gfx_v9_address_watch_disable, - .address_watch_execute = kgd_gfx_v9_address_watch_execute, .wave_control_execute = kgd_gfx_v9_wave_control_execute, - .address_watch_get_offset = kgd_gfx_v9_address_watch_get_offset, .get_atc_vmid_pasid_mapping_info = kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, .set_vm_context_page_table_base = diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 7b7f4b2764c1..9378fc79e9ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -671,20 +671,6 @@ static bool get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK); } -static int kgd_address_watch_disable(struct amdgpu_device *adev) -{ - return 0; -} - -static int kgd_address_watch_execute(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - return 0; -} - static int kgd_wave_control_execute(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd) @@ -709,13 +695,6 @@ static int kgd_wave_control_execute(struct amdgpu_device *adev, return 0; } -static uint32_t kgd_address_watch_get_offset(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return 0; -} - static void set_vm_context_page_table_base(struct amdgpu_device *adev, uint32_t vmid, uint64_t page_table_base) { @@ -767,10 +746,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = { .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_address_watch_disable, - .address_watch_execute = kgd_address_watch_execute, .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, .get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info, .set_vm_context_page_table_base = set_vm_context_page_table_base, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c index 1f37d3574001..e9c80ce13f3e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c @@ -582,21 +582,6 @@ static int hqd_sdma_destroy_v10_3(struct amdgpu_device *adev, void *mqd, return 0; } - -static int address_watch_disable_v10_3(struct amdgpu_device *adev) -{ - return 0; -} - -static int address_watch_execute_v10_3(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - return 0; -} - static int wave_control_execute_v10_3(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd) @@ -621,13 +606,6 @@ static int wave_control_execute_v10_3(struct amdgpu_device *adev, return 0; } -static uint32_t address_watch_get_offset_v10_3(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return 0; -} - static void set_vm_context_page_table_base_v10_3(struct amdgpu_device *adev, uint32_t vmid, uint64_t page_table_base) { @@ -809,10 +787,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = { .hqd_sdma_is_occupied = hqd_sdma_is_occupied_v10_3, .hqd_destroy = hqd_destroy_v10_3, .hqd_sdma_destroy = hqd_sdma_destroy_v10_3, - .address_watch_disable = address_watch_disable_v10_3, - .address_watch_execute = address_watch_execute_v10_3, .wave_control_execute = wave_control_execute_v10_3, - .address_watch_get_offset = address_watch_get_offset_v10_3, .get_atc_vmid_pasid_mapping_info = NULL, .set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3, .program_trap_handler_settings = program_trap_handler_settings_v10_3, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 36528dad7684..65552bb7d2f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -45,43 +45,6 @@ enum { MAX_WATCH_ADDRESSES = 4 }; -enum { - 
ADDRESS_WATCH_REG_ADDR_HI = 0, - ADDRESS_WATCH_REG_ADDR_LO, - ADDRESS_WATCH_REG_CNTL, - ADDRESS_WATCH_REG_MAX -}; - -/* not defined in the CI/KV reg file */ -enum { - ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, - ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, - /* extend the mask to 26 bits to match the low address field */ - ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, - ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF -}; - -static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { - mmTCP_WATCH0_ADDR_H, mmTCP_WATCH0_ADDR_L, mmTCP_WATCH0_CNTL, - mmTCP_WATCH1_ADDR_H, mmTCP_WATCH1_ADDR_L, mmTCP_WATCH1_CNTL, - mmTCP_WATCH2_ADDR_H, mmTCP_WATCH2_ADDR_L, mmTCP_WATCH2_CNTL, - mmTCP_WATCH3_ADDR_H, mmTCP_WATCH3_ADDR_L, mmTCP_WATCH3_CNTL -}; - -union TCP_WATCH_CNTL_BITS { - struct { - uint32_t mask:24; - uint32_t vmid:4; - uint32_t atc:1; - uint32_t mode:2; - uint32_t valid:1; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe, uint32_t queue, uint32_t vmid) { @@ -529,55 +492,6 @@ static int kgd_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd, return 0; } -static int kgd_address_watch_disable(struct amdgpu_device *adev) -{ - union TCP_WATCH_CNTL_BITS cntl; - unsigned int i; - - cntl.u32All = 0; - - cntl.bitfields.valid = 0; - cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - cntl.bitfields.atc = 1; - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - WREG32(watchRegs[i * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], cntl.u32All); - - return 0; -} - -static int kgd_address_watch_execute(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - union TCP_WATCH_CNTL_BITS cntl; - - cntl.u32All = cntl_val; - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], cntl.u32All); - - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_HI], addr_hi); - - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_LO], addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - - WREG32(watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], cntl.u32All); - - return 0; -} - static int kgd_wave_control_execute(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd) @@ -602,13 +516,6 @@ static int kgd_wave_control_execute(struct amdgpu_device *adev, return 0; } -static uint32_t kgd_address_watch_get_offset(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset]; -} - static bool get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, uint8_t vmid, uint16_t *p_pasid) { @@ -665,10 +572,7 @@ const struct kfd2kgd_calls gfx_v7_kfd2kgd = { .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_address_watch_disable, - .address_watch_execute = kgd_address_watch_execute, .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, .get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info, 
.set_scratch_backing_va = set_scratch_backing_va, .set_vm_context_page_table_base = set_vm_context_page_table_base, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 52832cd69a93..9dc5f2a0cc07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -538,20 +538,6 @@ static bool get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK); } -static int kgd_address_watch_disable(struct amdgpu_device *adev) -{ - return 0; -} - -static int kgd_address_watch_execute(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - return 0; -} - static int kgd_wave_control_execute(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd) @@ -576,13 +562,6 @@ static int kgd_wave_control_execute(struct amdgpu_device *adev, return 0; } -static uint32_t kgd_address_watch_get_offset(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return 0; -} - static void set_scratch_backing_va(struct amdgpu_device *adev, uint64_t va, uint32_t vmid) { @@ -614,10 +593,7 @@ const struct kfd2kgd_calls gfx_v8_kfd2kgd = { .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_address_watch_disable, - .address_watch_execute = kgd_address_watch_execute, .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, .get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info, .set_scratch_backing_va = set_scratch_backing_va, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 1abf662a0e91..53895a41932e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -622,20 +622,6 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK); } -int kgd_gfx_v9_address_watch_disable(struct amdgpu_device *adev) -{ - return 0; -} - -int kgd_gfx_v9_address_watch_execute(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - return 0; -} - int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd) @@ -660,13 +646,6 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev, return 0; } -uint32_t kgd_gfx_v9_address_watch_get_offset(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return 0; -} - void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev, uint32_t vmid, uint64_t page_table_base) { @@ -888,10 +867,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = { .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_gfx_v9_hqd_destroy, .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_gfx_v9_address_watch_disable, - .address_watch_execute = kgd_gfx_v9_address_watch_execute, .wave_control_execute = kgd_gfx_v9_wave_control_execute, - .address_watch_get_offset = kgd_gfx_v9_address_watch_get_offset, .get_atc_vmid_pasid_mapping_info = kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index 24be49df26fd..c7ed3bc9053c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -46,19 +46,9 @@ int kgd_gfx_v9_hqd_destroy(struct amdgpu_device *adev, void *mqd, enum kfd_preempt_type reset_type, unsigned int utimeout, uint32_t pipe_id, uint32_t queue_id); -int kgd_gfx_v9_address_watch_disable(struct amdgpu_device *adev); -int kgd_gfx_v9_address_watch_execute(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo); int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd); -uint32_t kgd_gfx_v9_address_watch_get_offset(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset); - bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, uint8_t vmid, uint16_t *p_pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index c4f3aff11072..19cfbf9577b4 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -51,8 +51,6 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ - $(AMDKFD_PATH)/kfd_dbgdev.o \ - $(AMDKFD_PATH)/kfd_dbgmgr.o \ $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 64e3b4e3a712..13e46631e289 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,7 +39,6 @@ #include #include "kfd_priv.h" #include "kfd_device_queue_manager.h" -#include "kfd_dbgmgr.h" #include "kfd_svm.h" #include "amdgpu_amdkfd.h" #include "kfd_smi_events.h" @@ -580,299 +579,26 @@ err_pdd: static int kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_register_args *args = data; - struct kfd_dev *dev; - struct kfd_dbgmgr *dbgmgr_ptr; - struct kfd_process_device *pdd; - bool create_ok; - long status = 0; - - mutex_lock(&p->mutex); - pdd = kfd_process_device_data_by_id(p, args->gpu_id); - if (!pdd) { - status = -EINVAL; - goto err_pdd; - } - dev = pdd->dev; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); - status = -EINVAL; - goto err_chip_unsupp; - } - - mutex_lock(kfd_get_dbgmgr_mutex()); - - /* - * make sure that we have pdd, if this the first queue created for - * this process - */ - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - status = PTR_ERR(pdd); - goto out; - } - - if (!dev->dbgmgr) { - /* In case of a legal call, we have no dbgmgr yet */ - create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); - if (create_ok) { - status = kfd_dbgmgr_register(dbgmgr_ptr, p); - if (status != 0) - kfd_dbgmgr_destroy(dbgmgr_ptr); - else - dev->dbgmgr = dbgmgr_ptr; - } - } else { - pr_debug("debugger already registered\n"); - status = -EINVAL; - } - -out: - mutex_unlock(kfd_get_dbgmgr_mutex()); -err_pdd: -err_chip_unsupp: - mutex_unlock(&p->mutex); - - return status; + return -EPERM; } static int kfd_ioctl_dbg_unregister(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_unregister_args *args = data; - struct kfd_process_device *pdd; - struct kfd_dev *dev; - long status; - - mutex_lock(&p->mutex); - pdd = 
kfd_process_device_data_by_id(p, args->gpu_id); - mutex_unlock(&p->mutex); - if (!pdd || !pdd->dev->dbgmgr) - return -EINVAL; - - dev = pdd->dev; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); - return -EINVAL; - } - - mutex_lock(kfd_get_dbgmgr_mutex()); - - status = kfd_dbgmgr_unregister(dev->dbgmgr, p); - if (!status) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - - mutex_unlock(kfd_get_dbgmgr_mutex()); - - return status; + return -EPERM; } -/* - * Parse and generate variable size data structure for address watch. - * Total size of the buffer and # watch points is limited in order - * to prevent kernel abuse. (no bearing to the much smaller HW limitation - * which is enforced by dbgdev module) - * please also note that the watch address itself are not "copied from user", - * since it be set into the HW in user mode values. - * - */ static int kfd_ioctl_dbg_address_watch(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_address_watch_args *args = data; - struct kfd_dev *dev; - struct kfd_process_device *pdd; - struct dbg_address_watch_info aw_info; - unsigned char *args_buff; - long status; - void __user *cmd_from_user; - uint64_t watch_mask_value = 0; - unsigned int args_idx = 0; - - memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); - - mutex_lock(&p->mutex); - pdd = kfd_process_device_data_by_id(p, args->gpu_id); - mutex_unlock(&p->mutex); - if (!pdd) { - pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); - return -EINVAL; - } - dev = pdd->dev; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; - } - cmd_from_user = (void __user *) args->content_ptr; - - /* Validate arguments */ - - if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || - (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || - (cmd_from_user == NULL)) - return -EINVAL; - - /* this is the actual buffer to work with */ - args_buff = memdup_user(cmd_from_user, - args->buf_size_in_bytes - sizeof(*args)); - if (IS_ERR(args_buff)) - return PTR_ERR(args_buff); - - aw_info.process = p; - - aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); - args_idx += sizeof(aw_info.num_watch_points); - - aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; - args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; - - /* - * set watch address base pointer to point on the array base - * within args_buff - */ - aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; - - /* skip over the addresses buffer */ - args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; - - if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { - status = -EINVAL; - goto out; - } - - watch_mask_value = (uint64_t) args_buff[args_idx]; - - if (watch_mask_value > 0) { - /* - * There is an array of masks. 
- * set watch mask base pointer to point on the array base - * within args_buff - */ - aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; - - /* skip over the masks buffer */ - args_idx += sizeof(aw_info.watch_mask) * - aw_info.num_watch_points; - } else { - /* just the NULL mask, set to NULL and skip over it */ - aw_info.watch_mask = NULL; - args_idx += sizeof(aw_info.watch_mask); - } - - if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { - status = -EINVAL; - goto out; - } - - /* Currently HSA Event is not supported for DBG */ - aw_info.watch_event = NULL; - - mutex_lock(kfd_get_dbgmgr_mutex()); - - status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); - - mutex_unlock(kfd_get_dbgmgr_mutex()); - -out: - kfree(args_buff); - - return status; + return -EPERM; } /* Parse and generate fixed size data structure for wave control */ static int kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_wave_control_args *args = data; - struct kfd_dev *dev; - struct kfd_process_device *pdd; - struct dbg_wave_control_info wac_info; - unsigned char *args_buff; - uint32_t computed_buff_size; - long status; - void __user *cmd_from_user; - unsigned int args_idx = 0; - - memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); - - /* we use compact form, independent of the packing attribute value */ - computed_buff_size = sizeof(*args) + - sizeof(wac_info.mode) + - sizeof(wac_info.operand) + - sizeof(wac_info.dbgWave_msg.DbgWaveMsg) + - sizeof(wac_info.dbgWave_msg.MemoryVA) + - sizeof(wac_info.trapId); - - mutex_lock(&p->mutex); - pdd = kfd_process_device_data_by_id(p, args->gpu_id); - mutex_unlock(&p->mutex); - if (!pdd) { - pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); - return -EINVAL; - } - dev = pdd->dev; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; - } - - /* input size must match the computed "compact" size */ - if (args->buf_size_in_bytes != computed_buff_size) { - pr_debug("size mismatch, computed : actual %u : %u\n", - args->buf_size_in_bytes, computed_buff_size); - return -EINVAL; - } - - cmd_from_user = (void __user *) args->content_ptr; - - if (cmd_from_user == NULL) - return -EINVAL; - - /* copy the entire buffer from user */ - - args_buff = memdup_user(cmd_from_user, - args->buf_size_in_bytes - sizeof(*args)); - if (IS_ERR(args_buff)) - return PTR_ERR(args_buff); - - /* move ptr to the start of the "pay-load" area */ - wac_info.process = p; - - wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.operand); - - wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.mode); - - wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.trapId); - - wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = - *((uint32_t *)(&args_buff[args_idx])); - wac_info.dbgWave_msg.MemoryVA = NULL; - - mutex_lock(kfd_get_dbgmgr_mutex()); - - pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", - wac_info.process, wac_info.operand, - wac_info.mode, wac_info.trapId, - wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - - status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); - - pr_debug("Returned status of dbg manager is %ld\n", status); - - mutex_unlock(kfd_get_dbgmgr_mutex()); - - kfree(args_buff); - - return status; + return -EPERM; } static int 
kfd_ioctl_get_clock_counters(struct file *filep, @@ -2900,16 +2626,16 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_WAIT_EVENTS, kfd_ioctl_wait_events, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER_DEPRECATED, kfd_ioctl_dbg_register, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED, kfd_ioctl_dbg_unregister, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, kfd_ioctl_dbg_address_watch, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, kfd_ioctl_dbg_wave_control, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c deleted file mode 100644 index 8eca9ed3ab36..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ /dev/null @@ -1,845 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "kfd_pm4_headers.h" -#include "kfd_pm4_headers_diq.h" -#include "kfd_kernel_queue.h" -#include "kfd_priv.h" -#include "kfd_pm4_opcodes.h" -#include "cik_regs.h" -#include "kfd_dbgmgr.h" -#include "kfd_dbgdev.h" -#include "kfd_device_queue_manager.h" - -static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) -{ - dev->kfd2kgd->address_watch_disable(dev->adev); -} - -static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, - u32 pasid, uint64_t vmid0_address, - uint32_t *packet_buff, size_t size_in_bytes) -{ - struct pm4__release_mem *rm_packet; - struct pm4__indirect_buffer_pasid *ib_packet; - struct kfd_mem_obj *mem_obj; - size_t pq_packets_size_in_bytes; - union ULARGE_INTEGER *largep; - union ULARGE_INTEGER addr; - struct kernel_queue *kq; - uint64_t *rm_state; - unsigned int *ib_packet_buff; - int status; - - if (WARN_ON(!size_in_bytes)) - return -EINVAL; - - kq = dbgdev->kq; - - pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + - sizeof(struct pm4__indirect_buffer_pasid); - - /* - * We acquire a buffer from DIQ - * The receive packet buff will be sitting on the Indirect Buffer - * and in the PQ we put the IB packet + sync packet(s). 
- */ - status = kq_acquire_packet_buffer(kq, - pq_packets_size_in_bytes / sizeof(uint32_t), - &ib_packet_buff); - if (status) { - pr_err("kq_acquire_packet_buffer failed\n"); - return status; - } - - memset(ib_packet_buff, 0, pq_packets_size_in_bytes); - - ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); - - ib_packet->header.count = 3; - ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; - ib_packet->header.type = PM4_TYPE_3; - - largep = (union ULARGE_INTEGER *) &vmid0_address; - - ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; - ib_packet->bitfields3.ib_base_hi = largep->u.high_part; - - ib_packet->control = (1 << 23) | (1 << 31) | - ((size_in_bytes / 4) & 0xfffff); - - ib_packet->bitfields5.pasid = pasid; - - /* - * for now we use release mem for GPU-CPU synchronization - * Consider WaitRegMem + WriteData as a better alternative - * we get a GART allocations ( gpu/cpu mapping), - * for the sync variable, and wait until: - * (a) Sync with HW - * (b) Sync var is written by CP to mem. - */ - rm_packet = (struct pm4__release_mem *) (ib_packet_buff + - (sizeof(struct pm4__indirect_buffer_pasid) / - sizeof(unsigned int))); - - status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), - &mem_obj); - - if (status) { - pr_err("Failed to allocate GART memory\n"); - kq_rollback_packet(kq); - return status; - } - - rm_state = (uint64_t *) mem_obj->cpu_ptr; - - *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; - - rm_packet->header.opcode = IT_RELEASE_MEM; - rm_packet->header.type = PM4_TYPE_3; - rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2; - - rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; - rm_packet->bitfields2.event_index = - event_index___release_mem__end_of_pipe; - - rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; - rm_packet->bitfields2.atc = 0; - rm_packet->bitfields2.tc_wb_action_ena = 1; - - addr.quad_part = mem_obj->gpu_addr; - - rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; - rm_packet->address_hi = addr.u.high_part; - - rm_packet->bitfields3.data_sel = - data_sel___release_mem__send_64_bit_data; - - rm_packet->bitfields3.int_sel = - int_sel___release_mem__send_data_after_write_confirm; - - rm_packet->bitfields3.dst_sel = - dst_sel___release_mem__memory_controller; - - rm_packet->data_lo = QUEUESTATE__ACTIVE; - - kq_submit_packet(kq); - - /* Wait till CP writes sync code: */ - status = amdkfd_fence_wait_timeout( - rm_state, - QUEUESTATE__ACTIVE, 1500); - - kfd_gtt_sa_free(dbgdev->dev, mem_obj); - - return status; -} - -static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) -{ - /* - * no action is needed in this case, - * just make sure diq will not be used - */ - - dbgdev->kq = NULL; - - return 0; -} - -static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) -{ - struct queue_properties properties; - unsigned int qid; - struct kernel_queue *kq = NULL; - int status; - - properties.type = KFD_QUEUE_TYPE_DIQ; - - status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, - &properties, &qid, NULL, NULL, NULL, NULL); - - if (status) { - pr_err("Failed to create DIQ\n"); - return status; - } - - pr_debug("DIQ Created with queue id: %d\n", qid); - - kq = pqm_get_kernel_queue(dbgdev->pqm, qid); - - if (!kq) { - pr_err("Error getting DIQ\n"); - pqm_destroy_queue(dbgdev->pqm, qid); - return -EFAULT; - } - - dbgdev->kq = kq; - - return status; -} - -static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev) -{ - /* disable watch address */ - 
dbgdev_address_watch_disable_nodiq(dbgdev->dev); - return 0; -} - -static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev) -{ - /* todo - disable address watch */ - int status; - - status = pqm_destroy_queue(dbgdev->pqm, - dbgdev->kq->queue->properties.queue_id); - dbgdev->kq = NULL; - - return status; -} - -static void dbgdev_address_watch_set_registers( - const struct dbg_address_watch_info *adw_info, - union TCP_WATCH_ADDR_H_BITS *addrHi, - union TCP_WATCH_ADDR_L_BITS *addrLo, - union TCP_WATCH_CNTL_BITS *cntl, - unsigned int index, unsigned int vmid) -{ - union ULARGE_INTEGER addr; - - addr.quad_part = 0; - addrHi->u32All = 0; - addrLo->u32All = 0; - cntl->u32All = 0; - - if (adw_info->watch_mask) - cntl->bitfields.mask = - (uint32_t) (adw_info->watch_mask[index] & - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK); - else - cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - - addr.quad_part = (unsigned long long) adw_info->watch_address[index]; - - addrHi->bitfields.addr = addr.u.high_part & - ADDRESS_WATCH_REG_ADDHIGH_MASK; - addrLo->bitfields.addr = - (addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT); - - cntl->bitfields.mode = adw_info->watch_mode[index]; - cntl->bitfields.vmid = (uint32_t) vmid; - /* for now assume it is an ATC address */ - cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; - - pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); - pr_debug("\t\t%20s %08x\n", "set reg add high :", - addrHi->bitfields.addr); - pr_debug("\t\t%20s %08x\n", "set reg add low :", - addrLo->bitfields.addr); -} - -static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info) -{ - union TCP_WATCH_ADDR_H_BITS addrHi; - union TCP_WATCH_ADDR_L_BITS addrLo; - union TCP_WATCH_CNTL_BITS cntl; - struct kfd_process_device *pdd; - unsigned int i; - - /* taking the vmid for that process on the safe way using pdd */ - pdd = kfd_get_process_device_data(dbgdev->dev, - adw_info->process); - if (!pdd) { - pr_err("Failed to get pdd for wave control no DIQ\n"); - return -EFAULT; - } - - addrHi.u32All = 0; - addrLo.u32All = 0; - cntl.u32All = 0; - - if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || - (adw_info->num_watch_points == 0)) { - pr_err("num_watch_points is invalid\n"); - return -EINVAL; - } - - if (!adw_info->watch_mode || !adw_info->watch_address) { - pr_err("adw_info fields are not valid\n"); - return -EINVAL; - } - - for (i = 0; i < adw_info->num_watch_points; i++) { - dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, - &cntl, i, pdd->qpd.vmid); - - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - pr_debug("\t\t%20s %08x\n", "register index :", i); - pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid); - pr_debug("\t\t%20s %08x\n", "Address Low is :", - addrLo.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Address high is :", - addrHi.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Address high is :", - addrHi.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Control Mask is :", - cntl.bitfields.mask); - pr_debug("\t\t%20s %08x\n", "Control Mode is :", - cntl.bitfields.mode); - pr_debug("\t\t%20s %08x\n", "Control Vmid is :", - cntl.bitfields.vmid); - pr_debug("\t\t%20s %08x\n", "Control atc is :", - cntl.bitfields.atc); - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - - pdd->dev->kfd2kgd->address_watch_execute( - dbgdev->dev->adev, - i, - cntl.u32All, - addrHi.u32All, - addrLo.u32All); - } - - return 0; -} - -static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, - 
struct dbg_address_watch_info *adw_info) -{ - struct pm4__set_config_reg *packets_vec; - union TCP_WATCH_ADDR_H_BITS addrHi; - union TCP_WATCH_ADDR_L_BITS addrLo; - union TCP_WATCH_CNTL_BITS cntl; - struct kfd_mem_obj *mem_obj; - unsigned int aw_reg_add_dword; - uint32_t *packet_buff_uint; - unsigned int i; - int status; - size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; - /* we do not control the vmid in DIQ mode, just a place holder */ - unsigned int vmid = 0; - - addrHi.u32All = 0; - addrLo.u32All = 0; - cntl.u32All = 0; - - if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || - (adw_info->num_watch_points == 0)) { - pr_err("num_watch_points is invalid\n"); - return -EINVAL; - } - - if (!adw_info->watch_mode || !adw_info->watch_address) { - pr_err("adw_info fields are not valid\n"); - return -EINVAL; - } - - status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); - - if (status) { - pr_err("Failed to allocate GART memory\n"); - return status; - } - - packet_buff_uint = mem_obj->cpu_ptr; - - memset(packet_buff_uint, 0, ib_size); - - packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); - - packets_vec[0].header.count = 1; - packets_vec[0].header.opcode = IT_SET_CONFIG_REG; - packets_vec[0].header.type = PM4_TYPE_3; - packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; - packets_vec[0].bitfields2.insert_vmid = 1; - packets_vec[1].ordinal1 = packets_vec[0].ordinal1; - packets_vec[1].bitfields2.insert_vmid = 0; - packets_vec[2].ordinal1 = packets_vec[0].ordinal1; - packets_vec[2].bitfields2.insert_vmid = 0; - packets_vec[3].ordinal1 = packets_vec[0].ordinal1; - packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET; - packets_vec[3].bitfields2.insert_vmid = 1; - - for (i = 0; i < adw_info->num_watch_points; i++) { - dbgdev_address_watch_set_registers(adw_info, - &addrHi, - &addrLo, - &cntl, - i, - vmid); - - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - pr_debug("\t\t%20s %08x\n", "register index :", i); - pr_debug("\t\t%20s %08x\n", "vmid is :", vmid); - pr_debug("\t\t%20s %p\n", "Add ptr is :", - adw_info->watch_address); - pr_debug("\t\t%20s %08llx\n", "Add is :", - adw_info->watch_address[i]); - pr_debug("\t\t%20s %08x\n", "Address Low is :", - addrLo.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Address high is :", - addrHi.bitfields.addr); - pr_debug("\t\t%20s %08x\n", "Control Mask is :", - cntl.bitfields.mask); - pr_debug("\t\t%20s %08x\n", "Control Mode is :", - cntl.bitfields.mode); - pr_debug("\t\t%20s %08x\n", "Control Vmid is :", - cntl.bitfields.vmid); - pr_debug("\t\t%20s %08x\n", "Control atc is :", - cntl.bitfields.atc); - pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); - - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->adev, - i, - ADDRESS_WATCH_REG_CNTL); - - packets_vec[0].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - - packets_vec[0].reg_data[0] = cntl.u32All; - - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->adev, - i, - ADDRESS_WATCH_REG_ADDR_HI); - - packets_vec[1].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[1].reg_data[0] = addrHi.u32All; - - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->adev, - i, - ADDRESS_WATCH_REG_ADDR_LO); - - packets_vec[2].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[2].reg_data[0] = addrLo.u32All; - - /* enable watch flag if address is not zero*/ - if 
(adw_info->watch_address[i] > 0) - cntl.bitfields.valid = 1; - else - cntl.bitfields.valid = 0; - - aw_reg_add_dword = - dbgdev->dev->kfd2kgd->address_watch_get_offset( - dbgdev->dev->adev, - i, - ADDRESS_WATCH_REG_CNTL); - - packets_vec[3].bitfields2.reg_offset = - aw_reg_add_dword - AMD_CONFIG_REG_BASE; - packets_vec[3].reg_data[0] = cntl.u32All; - - status = dbgdev_diq_submit_ib( - dbgdev, - adw_info->process->pasid, - mem_obj->gpu_addr, - packet_buff_uint, - ib_size); - - if (status) { - pr_err("Failed to submit IB to DIQ\n"); - break; - } - } - - kfd_gtt_sa_free(dbgdev->dev, mem_obj); - return status; -} - -static int dbgdev_wave_control_set_registers( - struct dbg_wave_control_info *wac_info, - union SQ_CMD_BITS *in_reg_sq_cmd, - union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) -{ - int status = 0; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct HsaDbgWaveMsgAMDGen2 *pMsg; - - reg_sq_cmd.u32All = 0; - reg_gfx_index.u32All = 0; - pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; - - switch (wac_info->mode) { - /* Send command to single wave */ - case HSA_DBG_WAVEMODE_SINGLE: - /* - * Limit access to the process waves only, - * by setting vmid check - */ - reg_sq_cmd.bits.check_vmid = 1; - reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; - reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; - reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; - - reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; - reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; - reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; - - break; - - /* Send command to all waves with matching VMID */ - case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: - - reg_gfx_index.bits.sh_broadcast_writes = 1; - reg_gfx_index.bits.se_broadcast_writes = 1; - reg_gfx_index.bits.instance_broadcast_writes = 1; - - reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; - - break; - - /* Send command to all CU waves with matching VMID */ - case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: - - reg_sq_cmd.bits.check_vmid = 1; - reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; - - reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; - reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; - reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; - - break; - - default: - return -EINVAL; - } - - switch (wac_info->operand) { - case HSA_DBG_WAVEOP_HALT: - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; - break; - - case HSA_DBG_WAVEOP_RESUME: - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; - break; - - case HSA_DBG_WAVEOP_KILL: - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_KILL; - break; - - case HSA_DBG_WAVEOP_DEBUG: - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_DEBUG; - break; - - case HSA_DBG_WAVEOP_TRAP: - if (wac_info->trapId < MAX_TRAPID) { - reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_TRAP; - reg_sq_cmd.bits.trap_id = wac_info->trapId; - } else { - status = -EINVAL; - } - break; - - default: - status = -EINVAL; - break; - } - - if (status == 0) { - *in_reg_sq_cmd = reg_sq_cmd; - *in_reg_gfx_index = reg_gfx_index; - } - - return status; -} - -static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, - struct dbg_wave_control_info *wac_info) -{ - - int status; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_mem_obj *mem_obj; - uint32_t *packet_buff_uint; - struct pm4__set_config_reg *packets_vec; - size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; - - reg_sq_cmd.u32All = 0; - - status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index); - if (status) { - pr_err("Failed to set wave 
control registers\n"); - return status; - } - - /* we do not control the VMID in DIQ, so reset it to a known value */ - reg_sq_cmd.bits.vm_id = 0; - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - pr_debug("\t\t mode is: %u\n", wac_info->mode); - pr_debug("\t\t operand is: %u\n", wac_info->operand); - pr_debug("\t\t trap id is: %u\n", wac_info->trapId); - pr_debug("\t\t msg value is: %u\n", - wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - pr_debug("\t\t vmid is: N/A\n"); - - pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); - pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); - pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); - pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); - pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); - pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); - pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); - - pr_debug("\t\t ibw is : %u\n", - reg_gfx_index.bitfields.instance_broadcast_writes); - pr_debug("\t\t ii is : %u\n", - reg_gfx_index.bitfields.instance_index); - pr_debug("\t\t sebw is : %u\n", - reg_gfx_index.bitfields.se_broadcast_writes); - pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); - pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); - pr_debug("\t\t sbw is : %u\n", - reg_gfx_index.bitfields.sh_broadcast_writes); - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); - - if (status != 0) { - pr_err("Failed to allocate GART memory\n"); - return status; - } - - packet_buff_uint = mem_obj->cpu_ptr; - - memset(packet_buff_uint, 0, ib_size); - - packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; - packets_vec[0].header.count = 1; - packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; - packets_vec[0].header.type = PM4_TYPE_3; - packets_vec[0].bitfields2.reg_offset = - GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; - - packets_vec[0].bitfields2.insert_vmid = 0; - packets_vec[0].reg_data[0] = reg_gfx_index.u32All; - - packets_vec[1].header.count = 1; - packets_vec[1].header.opcode = IT_SET_CONFIG_REG; - packets_vec[1].header.type = PM4_TYPE_3; - packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE; - - packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; - packets_vec[1].bitfields2.insert_vmid = 1; - packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; - - /* Restore the GRBM_GFX_INDEX register */ - - reg_gfx_index.u32All = 0; - reg_gfx_index.bits.sh_broadcast_writes = 1; - reg_gfx_index.bits.instance_broadcast_writes = 1; - reg_gfx_index.bits.se_broadcast_writes = 1; - - - packets_vec[2].ordinal1 = packets_vec[0].ordinal1; - packets_vec[2].bitfields2.reg_offset = - GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; - - packets_vec[2].bitfields2.insert_vmid = 0; - packets_vec[2].reg_data[0] = reg_gfx_index.u32All; - - status = dbgdev_diq_submit_ib( - dbgdev, - wac_info->process->pasid, - mem_obj->gpu_addr, - packet_buff_uint, - ib_size); - - if (status) - pr_err("Failed to submit IB to DIQ\n"); - - kfd_gtt_sa_free(dbgdev->dev, mem_obj); - - return status; -} - -static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, - struct dbg_wave_control_info *wac_info) -{ - int status; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_process_device *pdd; - - reg_sq_cmd.u32All = 0; - - /* taking the VMID for that process on the safe way using PDD */ - pdd = 
kfd_get_process_device_data(dbgdev->dev, wac_info->process); - - if (!pdd) { - pr_err("Failed to get pdd for wave control no DIQ\n"); - return -EFAULT; - } - status = dbgdev_wave_control_set_registers(wac_info, ®_sq_cmd, - ®_gfx_index); - if (status) { - pr_err("Failed to set wave control registers\n"); - return status; - } - - /* for non DIQ we need to patch the VMID: */ - - reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - pr_debug("\t\t mode is: %u\n", wac_info->mode); - pr_debug("\t\t operand is: %u\n", wac_info->operand); - pr_debug("\t\t trap id is: %u\n", wac_info->trapId); - pr_debug("\t\t msg value is: %u\n", - wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); - - pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); - pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); - pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); - pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); - pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); - pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); - pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); - - pr_debug("\t\t ibw is : %u\n", - reg_gfx_index.bitfields.instance_broadcast_writes); - pr_debug("\t\t ii is : %u\n", - reg_gfx_index.bitfields.instance_index); - pr_debug("\t\t sebw is : %u\n", - reg_gfx_index.bitfields.se_broadcast_writes); - pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); - pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); - pr_debug("\t\t sbw is : %u\n", - reg_gfx_index.bitfields.sh_broadcast_writes); - - pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); - - return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->adev, - reg_gfx_index.u32All, - reg_sq_cmd.u32All); -} - -int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) -{ - int status = 0; - unsigned int vmid; - uint16_t queried_pasid; - union SQ_CMD_BITS reg_sq_cmd; - union GRBM_GFX_INDEX_BITS reg_gfx_index; - struct kfd_process_device *pdd; - struct dbg_wave_control_info wac_info; - int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; - int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; - - reg_sq_cmd.u32All = 0; - status = 0; - - wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS; - wac_info.operand = HSA_DBG_WAVEOP_KILL; - - pr_debug("Killing all process wavefronts\n"); - - /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. - * ATC_VMID15_PASID_MAPPING - * to check which VMID the current process is mapped to. 
- */ - - for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { - status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info - (dev->adev, vmid, &queried_pasid); - - if (status && queried_pasid == p->pasid) { - pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n", - vmid, p->pasid); - break; - } - } - - if (vmid > last_vmid_to_scan) { - pr_err("Didn't find vmid for pasid 0x%x\n", p->pasid); - return -EFAULT; - } - - /* taking the VMID for that process on the safe way using PDD */ - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) - return -EFAULT; - - status = dbgdev_wave_control_set_registers(&wac_info, ®_sq_cmd, - ®_gfx_index); - if (status != 0) - return -EINVAL; - - /* for non DIQ we need to patch the VMID: */ - reg_sq_cmd.bits.vm_id = vmid; - - dev->kfd2kgd->wave_control_execute(dev->adev, - reg_gfx_index.u32All, - reg_sq_cmd.u32All); - - return 0; -} - -void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, - enum DBGDEV_TYPE type) -{ - pdbgdev->dev = pdev; - pdbgdev->kq = NULL; - pdbgdev->type = type; - pdbgdev->pqm = NULL; - - switch (type) { - case DBGDEV_TYPE_NODIQ: - pdbgdev->dbgdev_register = dbgdev_register_nodiq; - pdbgdev->dbgdev_unregister = dbgdev_unregister_nodiq; - pdbgdev->dbgdev_wave_control = dbgdev_wave_control_nodiq; - pdbgdev->dbgdev_address_watch = dbgdev_address_watch_nodiq; - break; - case DBGDEV_TYPE_DIQ: - default: - pdbgdev->dbgdev_register = dbgdev_register_diq; - pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; - pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; - pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq; - break; - } - -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h deleted file mode 100644 index 0619c777b47e..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef KFD_DBGDEV_H_ -#define KFD_DBGDEV_H_ - -enum { - SQ_CMD_VMID_OFFSET = 28, - ADDRESS_WATCH_CNTL_OFFSET = 24 -}; - -enum { - PRIV_QUEUE_SYNC_TIME_MS = 200 -}; - -/* CONTEXT reg space definition */ -enum { - CONTEXT_REG_BASE = 0xA000, - CONTEXT_REG_END = 0xA400, - CONTEXT_REG_SIZE = CONTEXT_REG_END - CONTEXT_REG_BASE -}; - -/* USER CONFIG reg space definition */ -enum { - USERCONFIG_REG_BASE = 0xC000, - USERCONFIG_REG_END = 0x10000, - USERCONFIG_REG_SIZE = USERCONFIG_REG_END - USERCONFIG_REG_BASE -}; - -/* CONFIG reg space definition */ -enum { - AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */ - AMD_CONFIG_REG_END = 0x2B00, - AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE -}; - -/* SH reg space definition */ -enum { - SH_REG_BASE = 0x2C00, - SH_REG_END = 0x3000, - SH_REG_SIZE = SH_REG_END - SH_REG_BASE -}; - -/* SQ_CMD definitions */ -#define SQ_CMD 0x8DEC - -enum SQ_IND_CMD_CMD { - SQ_IND_CMD_CMD_NULL = 0x00000000, - SQ_IND_CMD_CMD_HALT = 0x00000001, - SQ_IND_CMD_CMD_RESUME = 0x00000002, - SQ_IND_CMD_CMD_KILL = 0x00000003, - SQ_IND_CMD_CMD_DEBUG = 0x00000004, - SQ_IND_CMD_CMD_TRAP = 0x00000005, -}; - -enum SQ_IND_CMD_MODE { - SQ_IND_CMD_MODE_SINGLE = 0x00000000, - SQ_IND_CMD_MODE_BROADCAST = 0x00000001, - SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002, - SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003, - SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004, -}; - -union SQ_IND_INDEX_BITS { - struct { - uint32_t wave_id:4; - uint32_t simd_id:2; - uint32_t thread_id:6; - uint32_t:1; - uint32_t force_read:1; - uint32_t read_timeout:1; - uint32_t unindexed:1; - uint32_t index:16; - - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -union SQ_IND_CMD_BITS { - struct { - uint32_t data:32; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -union SQ_CMD_BITS { - struct { - uint32_t cmd:3; - uint32_t:1; - uint32_t mode:3; - uint32_t check_vmid:1; - uint32_t trap_id:3; - uint32_t:5; - uint32_t wave_id:4; - uint32_t simd_id:2; - uint32_t:2; - uint32_t queue_id:3; - uint32_t:1; - uint32_t vm_id:4; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -union SQ_IND_DATA_BITS { - struct { - uint32_t data:32; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -union GRBM_GFX_INDEX_BITS { - struct { - uint32_t instance_index:8; - uint32_t sh_index:8; - uint32_t se_index:8; - uint32_t:5; - uint32_t sh_broadcast_writes:1; - uint32_t instance_broadcast_writes:1; - uint32_t se_broadcast_writes:1; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -union TCP_WATCH_ADDR_H_BITS { - struct { - uint32_t addr:16; - uint32_t:16; - - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -union TCP_WATCH_ADDR_L_BITS { - struct { - uint32_t:6; - uint32_t addr:26; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -enum { - QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */ - QUEUESTATE__ACTIVE_COMPLETION_PENDING, - QUEUESTATE__ACTIVE -}; - -union ULARGE_INTEGER { - struct { - uint32_t low_part; - uint32_t high_part; - } u; - unsigned long long quad_part; -}; - - -#define KFD_CIK_VMID_START_OFFSET (8) -#define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8)) - - -void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, - enum DBGDEV_TYPE type); - -union TCP_WATCH_CNTL_BITS { - struct { - uint32_t mask:24; - uint32_t vmid:4; - uint32_t atc:1; - uint32_t 
mode:2; - uint32_t valid:1; - } bitfields, bits; - uint32_t u32All; - signed int i32All; - float f32All; -}; - -enum { - ADDRESS_WATCH_REG_CNTL_ATC_BIT = 0x10000000UL, - ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK = 0x00FFFFFF, - ADDRESS_WATCH_REG_ADDLOW_MASK_EXTENSION = 0x03000000, - /* extend the mask to 26 bits in order to match the low address field */ - ADDRESS_WATCH_REG_ADDLOW_SHIFT = 6, - ADDRESS_WATCH_REG_ADDHIGH_MASK = 0xFFFF -}; - -enum { - MAX_TRAPID = 8, /* 3 bits in the bitfield. */ - MAX_WATCH_ADDRESSES = 4 -}; - -enum { - ADDRESS_WATCH_REG_ADDR_HI = 0, - ADDRESS_WATCH_REG_ADDR_LO, - ADDRESS_WATCH_REG_CNTL, - ADDRESS_WATCH_REG_MAX -}; - -#endif /* KFD_DBGDEV_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c deleted file mode 100644 index 9bfa50633654..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ -#include -#include -#include -#include -#include -#include - -#include "kfd_priv.h" -#include "cik_regs.h" -#include "kfd_pm4_headers.h" -#include "kfd_pm4_headers_diq.h" -#include "kfd_dbgmgr.h" -#include "kfd_dbgdev.h" -#include "kfd_device_queue_manager.h" - -static DEFINE_MUTEX(kfd_dbgmgr_mutex); - -struct mutex *kfd_get_dbgmgr_mutex(void) -{ - return &kfd_dbgmgr_mutex; -} - - -static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr) -{ - kfree(pmgr->dbgdev); - - pmgr->dbgdev = NULL; - pmgr->pasid = 0; - pmgr->dev = NULL; -} - -void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr) -{ - if (pmgr) { - kfd_dbgmgr_uninitialize(pmgr); - kfree(pmgr); - } -} - -bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) -{ - enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ; - struct kfd_dbgmgr *new_buff; - - if (WARN_ON(!pdev->init_complete)) - return false; - - new_buff = kfd_alloc_struct(new_buff); - if (!new_buff) { - pr_err("Failed to allocate dbgmgr instance\n"); - return false; - } - - new_buff->pasid = 0; - new_buff->dev = pdev; - new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev); - if (!new_buff->dbgdev) { - pr_err("Failed to allocate dbgdev instance\n"); - kfree(new_buff); - return false; - } - - /* get actual type of DBGDevice cpsch or not */ - if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) - type = DBGDEV_TYPE_NODIQ; - - kfd_dbgdev_init(new_buff->dbgdev, pdev, type); - *ppmgr = new_buff; - - return true; -} - -long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) -{ - if (pmgr->pasid != 0) { - pr_debug("H/W debugger is already active using pasid 0x%x\n", - pmgr->pasid); - return -EBUSY; - } - - /* remember pasid */ - pmgr->pasid = p->pasid; - - /* provide the pqm for diq generation */ - pmgr->dbgdev->pqm = &p->pqm; - - /* activate the actual registering */ - pmgr->dbgdev->dbgdev_register(pmgr->dbgdev); - - return 0; -} - -long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) -{ - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != p->pasid) { - pr_debug("H/W debugger is not registered by calling pasid 0x%x\n", - p->pasid); - return -EINVAL; - } - - pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev); - - pmgr->pasid = 0; - - return 0; -} - -long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, - struct dbg_wave_control_info *wac_info) -{ - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != wac_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid 0x%x\n", - wac_info->process->pasid); - return -EINVAL; - } - - return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); -} - -long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, - struct dbg_address_watch_info *adw_info) -{ - /* Is the requests coming from the already registered process? */ - if (pmgr->pasid != adw_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid 0x%x\n", - adw_info->process->pasid); - return -EINVAL; - } - - return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, - adw_info); -} - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h deleted file mode 100644 index f9c6df1fdc5c..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#ifndef KFD_DBGMGR_H_ -#define KFD_DBGMGR_H_ - -#include "kfd_priv.h" - -/* must align with hsakmttypes definition */ -#pragma pack(push, 4) - -enum HSA_DBG_WAVEOP { - HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */ - HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */ - HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */ - HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter dbg mode */ - HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */ - HSA_DBG_NUM_WAVEOP = 5, - HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF -}; - -enum HSA_DBG_WAVEMODE { - /* send command to a single wave */ - HSA_DBG_WAVEMODE_SINGLE = 0, - /* - * Broadcast to all wavefronts of all processes is not - * supported for HSA user mode - */ - - /* send to waves within current process */ - HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, - /* send to waves within current process on CU */ - HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, - HSA_DBG_NUM_WAVEMODE = 3, - HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF -}; - -enum HSA_DBG_WAVEMSG_TYPE { - HSA_DBG_WAVEMSG_AUTO = 0, - HSA_DBG_WAVEMSG_USER = 1, - HSA_DBG_WAVEMSG_ERROR = 2, - HSA_DBG_NUM_WAVEMSG, - HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF -}; - -enum HSA_DBG_WATCH_MODE { - HSA_DBG_WATCH_READ = 0, /* Read operations only */ - HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */ - HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */ - HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */ - HSA_DBG_WATCH_NUM, - HSA_DBG_WATCH_SIZE = 0xFFFFFFFF -}; - -/* This structure is hardware specific and may change in the future */ -struct HsaDbgWaveMsgAMDGen2 { - union { - struct ui32 { - uint32_t UserData:8; /* user data */ - uint32_t ShaderArray:1; /* Shader array */ - uint32_t Priv:1; /* Privileged */ - uint32_t Reserved0:4; /* Reserved, should be 0 */ - uint32_t WaveId:4; /* wave id */ - uint32_t SIMD:2; /* SIMD id */ - uint32_t HSACU:4; /* Compute unit */ - uint32_t ShaderEngine:2;/* Shader engine */ - uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */ - uint32_t Reserved1:4; /* Reserved, should be 0 */ - } ui32; - uint32_t Value; - }; - uint32_t Reserved2; -}; - -union HsaDbgWaveMessageAMD { - struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2; - /* for future HsaDbgWaveMsgAMDGen3; */ -}; - -struct HsaDbgWaveMessage { - void *MemoryVA; /* ptr to associated host-accessible data */ - union HsaDbgWaveMessageAMD DbgWaveMsg; -}; - -/* - * TODO: This definitions to be MOVED to kfd_event, once it is implemented. 
- * - * HSA sync primitive, Event and HW Exception notification API definitions. - * The API functions allow the runtime to define a so-called sync-primitive, - * a SW object combining a user-mode provided "syncvar" and a scheduler event - * that can be signaled through a defined GPU interrupt. A syncvar is - * a process virtual memory location of a certain size that can be accessed - * by CPU and GPU shader code within the process to set and query the content - * within that memory. The definition of the content is determined by the HSA - * runtime and potentially GPU shader code interfacing with the HSA runtime. - * The syncvar values may be commonly written through an PM4 WRITE_DATA packet - * in the user mode instruction stream. The OS scheduler event is typically - * associated and signaled by an interrupt issued by the GPU, but other HSA - * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced - * by the KFD by this mechanism, too. - */ - -/* these are the new definitions for events */ -enum HSA_EVENTTYPE { - HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */ - HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */ - HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change - * (start/stop) - */ - HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */ - HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */ - HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */ - HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */ - HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state - * (EOP pm4) - */ - /* ... */ - HSA_EVENTTYPE_MAXID, - HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF -}; - -/* Sub-definitions for various event types: Syncvar */ -struct HsaSyncVar { - union SyncVar { - void *UserData; /* pointer to user mode data */ - uint64_t UserDataPtrValue; /* 64bit compatibility of value */ - } SyncVar; - uint64_t SyncVarSize; -}; - -/* Sub-definitions for various event types: NodeChange */ - -enum HSA_EVENTTYPE_NODECHANGE_FLAGS { - HSA_EVENTTYPE_NODECHANGE_ADD = 0, - HSA_EVENTTYPE_NODECHANGE_REMOVE = 1, - HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF -}; - -struct HsaNodeChange { - /* HSA node added/removed on the platform */ - enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; -}; - -/* Sub-definitions for various event types: DeviceStateChange */ -enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS { - /* device started (and available) */ - HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, - /* device stopped (i.e. 
unavailable) */ - HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, - HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF -}; - -enum HSA_DEVICE { - HSA_DEVICE_CPU = 0, - HSA_DEVICE_GPU = 1, - MAX_HSA_DEVICE = 2 -}; - -struct HsaDeviceStateChange { - uint32_t NodeId; /* F-NUMA node that contains the device */ - enum HSA_DEVICE Device; /* device type: GPU or CPU */ - enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */ -}; - -struct HsaEventData { - enum HSA_EVENTTYPE EventType; /* event type */ - union EventData { - /* - * return data associated with HSA_EVENTTYPE_SIGNAL - * and other events - */ - struct HsaSyncVar SyncVar; - - /* data associated with HSA_EVENTTYPE_NODE_CHANGE */ - struct HsaNodeChange NodeChangeState; - - /* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */ - struct HsaDeviceStateChange DeviceState; - } EventData; - - /* the following data entries are internal to the KFD & thunk itself */ - - /* internal thunk store for Event data (OsEventHandle) */ - uint64_t HWData1; - /* internal thunk store for Event data (HWAddress) */ - uint64_t HWData2; - /* internal thunk store for Event data (HWData) */ - uint32_t HWData3; -}; - -struct HsaEventDescriptor { - /* event type to allocate */ - enum HSA_EVENTTYPE EventType; - /* H-NUMA node containing GPU device that is event source */ - uint32_t NodeId; - /* pointer to user mode syncvar data, syncvar->UserDataPtrValue - * may be NULL - */ - struct HsaSyncVar SyncVar; -}; - -struct HsaEvent { - uint32_t EventId; - struct HsaEventData EventData; -}; - -#pragma pack(pop) - -enum DBGDEV_TYPE { - DBGDEV_TYPE_ILLEGAL = 0, - DBGDEV_TYPE_NODIQ = 1, - DBGDEV_TYPE_DIQ = 2, - DBGDEV_TYPE_TEST = 3 -}; - -struct dbg_address_watch_info { - struct kfd_process *process; - enum HSA_DBG_WATCH_MODE *watch_mode; - uint64_t *watch_address; - uint64_t *watch_mask; - struct HsaEvent *watch_event; - uint32_t num_watch_points; -}; - -struct dbg_wave_control_info { - struct kfd_process *process; - uint32_t trapId; - enum HSA_DBG_WAVEOP operand; - enum HSA_DBG_WAVEMODE mode; - struct HsaDbgWaveMessage dbgWave_msg; -}; - -struct kfd_dbgdev { - - /* The device that owns this data. */ - struct kfd_dev *dev; - - /* kernel queue for DIQ */ - struct kernel_queue *kq; - - /* a pointer to the pqm of the calling process */ - struct process_queue_manager *pqm; - - /* type of debug device ( DIQ, non DIQ, etc. 
) */ - enum DBGDEV_TYPE type; - - /* virtualized function pointers to device dbg */ - int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); - int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); - int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, - struct dbg_address_watch_info *adw_info); - int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, - struct dbg_wave_control_info *wac_info); - -}; - -struct kfd_dbgmgr { - u32 pasid; - struct kfd_dev *dev; - struct kfd_dbgdev *dbgdev; -}; - -/* prototypes for debug manager functions */ -struct mutex *kfd_get_dbgmgr_mutex(void); -void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr); -bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); -long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); -long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); -long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, - struct dbg_wave_control_info *wac_info); -long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, - struct dbg_address_watch_info *adw_info); -#endif /* KFD_DBGMGR_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 5a47f437b455..dbb877fba724 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -575,8 +575,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, if (kfd_resume(kfd)) goto kfd_resume_error; - kfd->dbgmgr = NULL; - if (kfd_topology_add_device(kfd)) { dev_err(kfd_device, "Error adding device to topology\n"); goto kfd_topology_add_device_error; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 219acc818eb4..7f6f1a842b0b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -480,6 +480,65 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm, dqm->allocated_queues[q->pipe] |= (1 << q->queue); } +#define SQ_IND_CMD_CMD_KILL 0x00000003 +#define SQ_IND_CMD_MODE_BROADCAST 0x00000001 + +static int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) +{ + int status = 0; + unsigned int vmid; + uint16_t queried_pasid; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; + struct kfd_process_device *pdd; + int first_vmid_to_scan = dev->vm_info.first_vmid_kfd; + int last_vmid_to_scan = dev->vm_info.last_vmid_kfd; + + reg_sq_cmd.u32All = 0; + reg_gfx_index.u32All = 0; + + pr_debug("Killing all process wavefronts\n"); + + /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. + * ATC_VMID15_PASID_MAPPING + * to check which VMID the current process is mapped to. 
+ */ + + for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { + status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info + (dev->adev, vmid, &queried_pasid); + + if (status && queried_pasid == p->pasid) { + pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n", + vmid, p->pasid); + break; + } + } + + if (vmid > last_vmid_to_scan) { + pr_err("Didn't find vmid for pasid 0x%x\n", p->pasid); + return -EFAULT; + } + + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) + return -EFAULT; + + reg_gfx_index.bits.sh_broadcast_writes = 1; + reg_gfx_index.bits.se_broadcast_writes = 1; + reg_gfx_index.bits.instance_broadcast_writes = 1; + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_KILL; + reg_sq_cmd.bits.vm_id = vmid; + + dev->kfd2kgd->wave_control_execute(dev->adev, + reg_gfx_index.u32All, + reg_sq_cmd.u32All); + + return 0; +} + /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked * to avoid asynchronized access */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index a7d2e3323977..05a9c17daa3e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -39,6 +39,41 @@ struct device_process_node { struct list_head list; }; +union SQ_CMD_BITS { + struct { + uint32_t cmd:3; + uint32_t:1; + uint32_t mode:3; + uint32_t check_vmid:1; + uint32_t trap_id:3; + uint32_t:5; + uint32_t wave_id:4; + uint32_t simd_id:2; + uint32_t:2; + uint32_t queue_id:3; + uint32_t:1; + uint32_t vm_id:4; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + +union GRBM_GFX_INDEX_BITS { + struct { + uint32_t instance_index:8; + uint32_t sh_index:8; + uint32_t se_index:8; + uint32_t:5; + uint32_t sh_broadcast_writes:1; + uint32_t instance_broadcast_writes:1; + uint32_t se_broadcast_writes:1; + } bitfields, bits; + uint32_t u32All; + signed int i32All; + float f32All; +}; + /** * struct device_queue_manager_ops * diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 66ad8d0b8f7f..fe62407dacb2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -30,7 +30,6 @@ #include #include #include "kfd_priv.h" -#include "kfd_dbgmgr.h" #include "kfd_topology.h" #include "kfd_iommu.h" @@ -163,17 +162,6 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, u32 pasid) pr_debug("Unbinding process 0x%x from IOMMU\n", pasid); - mutex_lock(kfd_get_dbgmgr_mutex()); - - if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { - if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - } - - mutex_unlock(kfd_get_dbgmgr_mutex()); - mutex_lock(&p->mutex); pdd = kfd_get_process_device_data(dev, p); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 74fb4762c66e..b6790a637f5c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -300,9 +300,6 @@ struct kfd_dev { */ bool interrupts_active; - /* Debug manager */ - struct kfd_dbgmgr *dbgmgr; - /* Firmware versions */ uint16_t mec_fw_version; uint16_t mec2_fw_version; @@ -1331,8 +1328,6 @@ void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid); void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type); -int 
dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); - bool kfd_is_locked(void); /* Compute profile */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 8e2780d2f735..8c6a48add76e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -40,7 +40,6 @@ struct mm_struct; #include "kfd_priv.h" #include "kfd_device_queue_manager.h" -#include "kfd_dbgmgr.h" #include "kfd_iommu.h" #include "kfd_svm.h" @@ -1158,7 +1157,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kfd_process *p; - int i; /* * The kfd_process structure can not be free because the @@ -1178,23 +1176,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_lock(&p->mutex); - /* Iterate over all process device data structures and if the - * pdd is in debug mode, we should first force unregistration, - * then we will be able to destroy the queues - */ - for (i = 0; i < p->n_pdds; i++) { - struct kfd_dev *dev = p->pdds[i]->dev; - - mutex_lock(kfd_get_dbgmgr_mutex()); - if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { - if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - } - mutex_unlock(kfd_get_dbgmgr_mutex()); - } - kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index ac941f62cbed..2f60cf35a444 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -269,18 +269,9 @@ struct kfd2kgd_calls { int (*hqd_sdma_destroy)(struct amdgpu_device *adev, void *mqd, unsigned int timeout); - int (*address_watch_disable)(struct amdgpu_device *adev); - int (*address_watch_execute)(struct amdgpu_device *adev, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo); int (*wave_control_execute)(struct amdgpu_device *adev, uint32_t gfx_index_val, uint32_t sq_cmd); - uint32_t (*address_watch_get_offset)(struct amdgpu_device *adev, - unsigned int watch_point_id, - unsigned int reg_offset); bool (*get_atc_vmid_pasid_mapping_info)(struct amdgpu_device *adev, uint8_t vmid, uint16_t *p_pasid); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index e6a56c146920..6e4268f5e482 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -756,16 +756,16 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_WAIT_EVENTS \ AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args) -#define AMDKFD_IOC_DBG_REGISTER \ +#define AMDKFD_IOC_DBG_REGISTER_DEPRECATED \ AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args) -#define AMDKFD_IOC_DBG_UNREGISTER \ +#define AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED \ AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args) -#define AMDKFD_IOC_DBG_ADDRESS_WATCH \ +#define AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED \ AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args) -#define AMDKFD_IOC_DBG_WAVE_CONTROL \ +#define AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED \ AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) #define AMDKFD_IOC_SET_SCRATCH_BACKING_VA \ -- cgit v1.2.3 From 5cad527d5ffa9a1c4731bb9c97d2ee93f8960d50 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Feb 2022 14:08:38 +0800 Subject: net: drop_monitor: support drop reason In the commit c504e5c2f964 ("net: skb: introduce kfree_skb_reason()") 
drop reason is introduced to the tracepoint of kfree_skb. Therefore, drop_monitor is able to report the drop reason to users by netlink. The drop reasons are reported as string to users, which is exactly the same as what we do when reporting it to ftrace. Signed-off-by: Menglong Dong Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220209060838.55513-1-imagedong@tencent.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/net_dropmon.h | 1 + net/core/drop_monitor.c | 41 ++++++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 66048cc5d7b3..1bbea8f0681e 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -93,6 +93,7 @@ enum net_dm_attr { NET_DM_ATTR_SW_DROPS, /* flag */ NET_DM_ATTR_HW_DROPS, /* flag */ NET_DM_ATTR_FLOW_ACTION_COOKIE, /* binary */ + NET_DM_ATTR_REASON, /* string */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 7b288a121a41..4641126b8a20 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -48,6 +48,19 @@ static int trace_state = TRACE_OFF; static bool monitor_hw; +#undef EM +#undef EMe + +#define EM(a, b) [a] = #b, +#define EMe(a, b) [a] = #b + +/* drop_reasons is used to translate 'enum skb_drop_reason' to string, + * which is reported to user space. + */ +static const char * const drop_reasons[] = { + TRACE_SKB_DROP_REASON +}; + /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. @@ -126,6 +139,7 @@ struct net_dm_skb_cb { struct devlink_trap_metadata *hw_metadata; void *pc; }; + enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) @@ -498,6 +512,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; + struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; @@ -508,7 +523,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - NET_DM_SKB_CB(nskb)->pc = location; + if ((unsigned int)reason >= SKB_DROP_REASON_MAX) + reason = SKB_DROP_REASON_NOT_SPECIFIED; + cb = NET_DM_SKB_CB(nskb); + cb->reason = reason; + cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. 
 */
@@ -553,7 +572,8 @@ static size_t net_dm_in_port_size(void)

 #define NET_DM_MAX_SYMBOL_LEN 40

-static size_t net_dm_packet_report_size(size_t payload_len)
+static size_t net_dm_packet_report_size(size_t payload_len,
+					enum skb_drop_reason reason)
 {
 	size_t size;

@@ -574,6 +594,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
 	       nla_total_size(sizeof(u32)) +
 	       /* NET_DM_ATTR_PROTO */
 	       nla_total_size(sizeof(u16)) +
+	       /* NET_DM_ATTR_REASON */
+	       nla_total_size(strlen(drop_reasons[reason]) + 1) +
 	       /* NET_DM_ATTR_PAYLOAD */
 	       nla_total_size(payload_len);
 }
@@ -606,7 +628,7 @@ nla_put_failure:
 static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 				     size_t payload_len)
 {
-	u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+	struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
 	char buf[NET_DM_MAX_SYMBOL_LEN];
 	struct nlattr *attr;
 	void *hdr;
@@ -620,10 +642,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 	if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
 		goto nla_put_failure;

-	if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+	if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+			      NET_DM_ATTR_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_string(msg, NET_DM_ATTR_REASON,
+			   drop_reasons[cb->reason]))
 		goto nla_put_failure;

-	snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+	snprintf(buf, sizeof(buf), "%pS", cb->pc);
 	if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
 		goto nla_put_failure;

@@ -679,7 +706,9 @@ static void net_dm_packet_report(struct sk_buff *skb)
 	if (net_dm_trunc_len)
 		payload_len = min_t(size_t, net_dm_trunc_len, payload_len);

-	msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+	msg = nlmsg_new(net_dm_packet_report_size(payload_len,
+						  NET_DM_SKB_CB(skb)->reason),
+			GFP_KERNEL);
 	if (!msg)
 		goto out;

--
cgit v1.2.3


From 7f5a08c79df35e68f1a43033450c5050f12bc155 Mon Sep 17 00:00:00 2001
From: Beau Belgrave
Date: Tue, 18 Jan 2022 12:43:15 -0800
Subject: user_events: Add minimal support for trace_event into ftrace

Minimal support for interacting with dynamic events, trace_event and
ftrace. Core outline of flow between user process, ioctl and trace_event
APIs.

User mode processes that wish to use trace events to get data into
ftrace, perf, eBPF, etc. are limited to uprobes today. The user events
feature enables an ABI for user mode processes to create and write to
trace events that are isolated from kernel level trace events. This
enables a faster path for tracing from user mode data as well as opens
managed code to participate in trace events, where stub locations are
dynamic.

User processes often want to trace only when it's useful. To enable
this, a set of pages is mapped into the user process space to indicate
the current state of the user events that have been registered. User
processes can check if their event is hooked to a trace/probe, and if it
is, emit the event data out via the write() syscall.

Two new files are introduced into tracefs to accomplish this:

user_events_status - This file is mmap'd into participating user mode
processes to indicate event status.

user_events_data - This file is opened and register/delete ioctls are
issued to create/open/delete trace events that can be used for tracing.

The typical scenario is to mmap user_events_status at process start.
Processes then register the events they plan to use via the REG ioctl.
The ioctl reads and updates the passed in user_reg struct. The
status_index of the struct is used to know the byte in the status page
to check for that event. The write_index of the struct is used to
describe that event when writing out to the fd that was used for the
ioctl call. The data must always include this index first when writing
out data for an event. Data can be written either by write() or by
writev(). For example, in memory:

	int index;
	char data[];
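Purely as an illustration of that layout (the struct below is
hypothetical and not part of any uapi header), a single write() for an
event registered with one u32 field could use a packed struct instead of
the two-element writev() in the pseudo-code that follows:

	/* Hypothetical write() payload: the write_index always leads. */
	struct example_payload {
		int write_index;	/* user_reg.write_index from DIAG_IOCSREG */
		__u32 id;		/* the event's own data follows */
	} __attribute__((packed));

A single write() of such a struct is equivalent to the two-element
writev() shown below.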
Pseudo-code example of typical usage:

	struct user_reg reg;

	int page_fd = open("user_events_status", O_RDWR);
	char *page_data = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED,
			       page_fd, 0);
	close(page_fd);

	int data_fd = open("user_events_data", O_RDWR);

	reg.size = sizeof(reg);
	reg.name_args = (__u64)"test";

	ioctl(data_fd, DIAG_IOCSREG, &reg);

	int status_id = reg.status_index;
	int write_id = reg.write_index;

	struct iovec io[2];
	io[0].iov_base = &write_id;
	io[0].iov_len = sizeof(write_id);
	io[1].iov_base = payload;
	io[1].iov_len = sizeof(payload);

	if (page_data[status_id])
		writev(data_fd, io, 2);

User events are also exposed via the dynamic_events tracefs file for
both create and delete. Current status is exposed via the
user_events_status tracefs file.

Simple example to register a user event via dynamic_events:

	echo u:test >> dynamic_events
	cat dynamic_events
	u:test

If an event is hooked to a probe, the hooked probe shows up:

	echo 1 > events/user_events/test/enable
	cat user_events_status
	1:test # Used by ftrace

	Active: 1
	Busy: 1
	Max: 4096

If an event is not hooked to a probe, no probe status shows up:

	echo 0 > events/user_events/test/enable
	cat user_events_status
	1:test

	Active: 1
	Busy: 0
	Max: 4096

Users can describe the trace event format via the following format:

	name[:FLAG1[,FLAG2...]] [field1[;field2...]]

Each field has the following format:

	type name

Example for a char array with a size of 20 named msg:

	echo 'u:detailed char[20] msg' >> dynamic_events
	cat dynamic_events
	u:detailed char[20] msg

Data offsets are based on the data written out via write() and will be
updated to reflect the correct offset in the trace_event fields. For
dynamic data it is recommended to use the new __rel_loc data type. This
type will be the same as __data_loc, but the offset is relative to this
entry. This allows user_events to not worry about what common fields are
being inserted before the data.

The above format is valid for both the ioctl and the dynamic_events
file.

Link: https://lkml.kernel.org/r/20220118204326.2169-2-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu
Signed-off-by: Beau Belgrave
Signed-off-by: Steven Rostedt (Google)
---
 include/uapi/linux/user_events.h |  116 ++++
 kernel/trace/Kconfig             |   14 +
 kernel/trace/Makefile            |    1 +
 kernel/trace/trace_events_user.c | 1187 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 1318 insertions(+)
 create mode 100644 include/uapi/linux/user_events.h
 create mode 100644 kernel/trace/trace_events_user.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
new file mode 100644
index 000000000000..e570840571e1
--- /dev/null
+++ b/include/uapi/linux/user_events.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ * + * Authors: + * Beau Belgrave + */ +#ifndef _UAPI_LINUX_USER_EVENTS_H +#define _UAPI_LINUX_USER_EVENTS_H + +#include +#include + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_PREFIX "u:" + +/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ +#define EVENT_BIT_FTRACE 0 +#define EVENT_BIT_PERF 1 +#define EVENT_BIT_OTHER 7 + +#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) +#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) +#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) + +/* Create dynamic location entry within a 32-bit value */ +#define DYN_LOC(offset, size) ((size) << 16 | (offset)) + +/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */ +#define FLAG_BPF_ITER (1 << 0) + +/* + * Describes an event registration and stores the results of the registration. + * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum + * must set the size and name_args before invocation. + */ +struct user_reg { + + /* Input: Size of the user_reg structure being used */ + __u32 size; + + /* Input: Pointer to string with event name, description and flags */ + __u64 name_args; + + /* Output: Byte index of the event within the status page */ + __u32 status_index; + + /* Output: Index of the event to use when writing data */ + __u32 write_index; +}; + +#define DIAG_IOC_MAGIC '*' + +/* Requests to register a user_event */ +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) + +/* Requests to delete a user_event */ +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) + +/* Data type that was passed to the BPF program */ +enum { + /* Data resides in kernel space */ + USER_BPF_DATA_KERNEL, + + /* Data resides in user space */ + USER_BPF_DATA_USER, + + /* Data is a pointer to a user_bpf_iter structure */ + USER_BPF_DATA_ITER, +}; + +/* + * Describes an iovec iterator that BPF programs can use to access data for + * a given user_event write() / writev() call. + */ +struct user_bpf_iter { + + /* Offset of the data within the first iovec */ + __u32 iov_offset; + + /* Number of iovec structures */ + __u32 nr_segs; + + /* Pointer to iovec structures */ + const struct iovec *iov; +}; + +/* Context that BPF programs receive when attached to a user_event */ +struct user_bpf_context { + + /* Data type being passed (see union below) */ + __u32 data_type; + + /* Length of the data */ + __u32 data_len; + + /* Pointer to data, varies by data type */ + union { + /* Kernel data (data_type == USER_BPF_DATA_KERNEL) */ + void *kdata; + + /* User data (data_type == USER_BPF_DATA_USER) */ + void *udata; + + /* Direct iovec (data_type == USER_BPF_DATA_ITER) */ + struct user_bpf_iter *iter; + }; +}; + +#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5eb5e7fd624..16a52a71732d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -737,6 +737,20 @@ config SYNTH_EVENTS If in doubt, say N. +config USER_EVENTS + bool "User trace events" + select TRACING + select DYNAMIC_EVENTS + help + User trace events are user-defined trace events that + can be used like an existing kernel trace event. User trace + events are generated by writing to a tracefs file. User + processes can determine if their tracing events should be + generated by memory mapping a tracefs file and checking for + an associated byte being non-zero. + + If in doubt, say N. 
+ config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..19ef3758da95 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o +obj-$(CONFIG_USER_EVENTS) += trace_events_user.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c new file mode 100644 index 000000000000..77105233115e --- /dev/null +++ b/kernel/trace/trace_events_user.c @@ -0,0 +1,1187 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" +#include "trace_dynevent.h" + +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1) + +#define FIELD_DEPTH_TYPE 0 +#define FIELD_DEPTH_NAME 1 +#define FIELD_DEPTH_SIZE 2 + +/* + * Limits how many trace_event calls user processes can create: + * Must be multiple of PAGE_SIZE. + */ +#define MAX_PAGES 1 +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) + +/* Limit how long of an event name plus args within the subsystem. */ +#define MAX_EVENT_DESC 512 +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define MAX_FIELD_ARRAY_SIZE 1024 + +static char *register_page_data; + +static DEFINE_MUTEX(reg_mutex); +static DEFINE_HASHTABLE(register_table, 4); +static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); + +/* + * Stores per-event properties, as users register events + * within a file a user_event might be created if it does not + * already exist. These are globally used and their lifetime + * is tied to the refcnt member. These cannot go away until the + * refcnt reaches zero. + */ +struct user_event { + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + atomic_t refcnt; + int index; + int flags; +}; + +/* + * Stores per-file events references, as users register events + * within a file this structure is modified and freed via RCU. + * The lifetime of this struct is tied to the lifetime of the file. + * These are not shared and only accessible by the file that created it. 
+ */ +struct user_event_refs { + struct rcu_head rcu; + int count; + struct user_event *events[]; +}; + +typedef void (*user_event_func_t) (struct user_event *user, + void *data, u32 datalen, + void *tpdata); + +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser); + +static u32 user_event_key(char *name) +{ + return jhash(name, strlen(name), 0); +} + +static struct list_head *user_event_get_fields(struct trace_event_call *call) +{ + struct user_event *user = (struct user_event *)call->data; + + return &user->fields; +} + +/* + * Parses a register command for user_events + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]] + * + * Example event named 'test' with a 20 char 'msg' field with an unsigned int + * 'id' field after: + * test char[20] msg;unsigned int id + * + * NOTE: Offsets are from the user data perspective, they are not from the + * trace_entry/buffer perspective. We automatically add the common properties + * sizes to the offset for the user. + */ +static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +{ + char *name = raw_command; + char *args = strpbrk(name, " "); + char *flags; + + if (args) + *args++ = '\0'; + + flags = strpbrk(name, ":"); + + if (flags) + *flags++ = '\0'; + + return user_event_parse(name, args, flags, newuser); +} + +static int user_field_array_size(const char *type) +{ + const char *start = strchr(type, '['); + char val[8]; + char *bracket; + int size = 0; + + if (start == NULL) + return -EINVAL; + + if (strscpy(val, start + 1, sizeof(val)) <= 0) + return -EINVAL; + + bracket = strchr(val, ']'); + + if (!bracket) + return -EINVAL; + + *bracket = '\0'; + + if (kstrtouint(val, 0, &size)) + return -EINVAL; + + if (size > MAX_FIELD_ARRAY_SIZE) + return -EINVAL; + + return size; +} + +static int user_field_size(const char *type) +{ + /* long is not allowed from a user, since it's ambigious in size */ + if (strcmp(type, "s64") == 0) + return sizeof(s64); + if (strcmp(type, "u64") == 0) + return sizeof(u64); + if (strcmp(type, "s32") == 0) + return sizeof(s32); + if (strcmp(type, "u32") == 0) + return sizeof(u32); + if (strcmp(type, "int") == 0) + return sizeof(int); + if (strcmp(type, "unsigned int") == 0) + return sizeof(unsigned int); + if (strcmp(type, "s16") == 0) + return sizeof(s16); + if (strcmp(type, "u16") == 0) + return sizeof(u16); + if (strcmp(type, "short") == 0) + return sizeof(short); + if (strcmp(type, "unsigned short") == 0) + return sizeof(unsigned short); + if (strcmp(type, "s8") == 0) + return sizeof(s8); + if (strcmp(type, "u8") == 0) + return sizeof(u8); + if (strcmp(type, "char") == 0) + return sizeof(char); + if (strcmp(type, "unsigned char") == 0) + return sizeof(unsigned char); + if (str_has_prefix(type, "char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "unsigned char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "__data_loc ")) + return sizeof(u32); + if (str_has_prefix(type, "__rel_loc ")) + return sizeof(u32); + + /* Uknown basic type, error */ + return -EINVAL; +} + +static void user_event_destroy_fields(struct user_event *user) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kfree(field); + } +} + +static int user_event_add_field(struct user_event *user, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct ftrace_event_field 
*field; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + + if (!field) + return -ENOMEM; + + field->type = type; + field->name = name; + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + field->filter_type = filter_type; + + list_add(&field->link, &user->fields); + + return 0; +} + +/* + * Parses the values of a field within the description + * Format: type name [size] + */ +static int user_event_parse_field(char *field, struct user_event *user, + u32 *offset) +{ + char *part, *type, *name; + u32 depth = 0, saved_offset = *offset; + int len, size = -EINVAL; + bool is_struct = false; + + field = skip_spaces(field); + + if (*field == '\0') + return 0; + + /* Handle types that have a space within */ + len = str_has_prefix(field, "unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "struct "); + if (len) { + is_struct = true; + goto skip_next; + } + + len = str_has_prefix(field, "__data_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__data_loc "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc "); + if (len) + goto skip_next; + + goto parse; +skip_next: + type = field; + field = strpbrk(field + len, " "); + + if (field == NULL) + return -EINVAL; + + *field++ = '\0'; + depth++; +parse: + while ((part = strsep(&field, " ")) != NULL) { + switch (depth++) { + case FIELD_DEPTH_TYPE: + type = part; + break; + case FIELD_DEPTH_NAME: + name = part; + break; + case FIELD_DEPTH_SIZE: + if (!is_struct) + return -EINVAL; + + if (kstrtou32(part, 10, &size)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (depth < FIELD_DEPTH_SIZE) + return -EINVAL; + + if (depth == FIELD_DEPTH_SIZE) + size = user_field_size(type); + + if (size == 0) + return -EINVAL; + + if (size < 0) + return size; + + *offset = saved_offset + size; + + return user_event_add_field(user, type, name, saved_offset, size, + type[0] != 'u', FILTER_OTHER); +} + +static void user_event_parse_flags(struct user_event *user, char *flags) +{ + char *flag; + + if (flags == NULL) + return; + + while ((flag = strsep(&flags, ",")) != NULL) { + if (strcmp(flag, "BPF_ITER") == 0) + user->flags |= FLAG_BPF_ITER; + } +} + +static int user_event_parse_fields(struct user_event *user, char *args) +{ + char *field; + u32 offset = sizeof(struct trace_entry); + int ret = -EINVAL; + + if (args == NULL) + return 0; + + while ((field = strsep(&args, ";")) != NULL) { + ret = user_event_parse_field(field, user, &offset); + + if (ret) + break; + } + + return ret; +} + +static struct trace_event_fields user_event_fields_array[1]; + +static enum print_line_t user_event_print_trace(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + /* Unsafe to try to decode user provided print_fmt, use hex */ + trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16, + 1, iter->ent, iter->ent_size, true); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions user_event_funcs = { + .trace = user_event_print_trace, +}; + +static int destroy_user_event(struct user_event *user) +{ + int ret = 0; + + /* Must destroy fields before call removal */ + user_event_destroy_fields(user); + + ret = trace_remove_event_call(&user->call); + + if (ret) + return ret; + + dyn_event_remove(&user->devent); + + register_page_data[user->index] = 0; + clear_bit(user->index, page_bitmap); + hash_del(&user->node); + + 
kfree(EVENT_NAME(user)); + kfree(user); + + return ret; +} + +static struct user_event *find_user_event(char *name, u32 *outkey) +{ + struct user_event *user; + u32 key = user_event_key(name); + + *outkey = key; + + hash_for_each_possible(register_table, user, node, key) + if (!strcmp(EVENT_NAME(user), name)) + return user; + + return NULL; +} + +/* + * Writes the user supplied payload out to a trace file. + */ +static void user_event_ftrace(struct user_event *user, void *data, u32 datalen, + void *tpdata) +{ + struct trace_event_file *file; + struct trace_entry *entry; + struct trace_event_buffer event_buffer; + + file = (struct trace_event_file *)tpdata; + + if (!file || + !(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) + return; + + /* Allocates and fills trace_entry, + 1 of this is data payload */ + entry = trace_event_buffer_reserve(&event_buffer, file, + sizeof(*entry) + datalen); + + if (unlikely(!entry)) + return; + + memcpy(entry + 1, data, datalen); + + trace_event_buffer_commit(&event_buffer); +} + +/* + * Update the register page that is shared between user processes. + */ +static void update_reg_page_for(struct user_event *user) +{ + struct tracepoint *tp = &user->tracepoint; + char status = 0; + + if (atomic_read(&tp->key.enabled) > 0) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + + if (probe_func == user_event_ftrace) + status |= EVENT_STATUS_FTRACE; + else + status |= EVENT_STATUS_OTHER; + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + } + + register_page_data[user->index] = status; +} + +/* + * Register callback for our events from tracing sub-systems. 
+ */
+static int user_event_reg(struct trace_event_call *call,
+			  enum trace_reg type,
+			  void *data)
+{
+	struct user_event *user = (struct user_event *)call->data;
+	int ret = 0;
+
+	if (!user)
+		return -ENOENT;
+
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		ret = tracepoint_probe_register(call->tp,
+						call->class->probe,
+						data);
+		if (!ret)
+			goto inc;
+		break;
+
+	case TRACE_REG_UNREGISTER:
+		tracepoint_probe_unregister(call->tp,
+					    call->class->probe,
+					    data);
+		goto dec;
+
+	default:
+		break;
+	}
+
+	return ret;
+inc:
+	atomic_inc(&user->refcnt);
+	update_reg_page_for(user);
+	return 0;
+dec:
+	update_reg_page_for(user);
+	atomic_dec(&user->refcnt);
+	return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+	struct user_event *user;
+	char *name;
+	int ret;
+
+	if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+		return -ECANCELED;
+
+	raw_command += USER_EVENTS_PREFIX_LEN;
+	raw_command = skip_spaces(raw_command);
+
+	name = kstrdup(raw_command, GFP_KERNEL);
+
+	if (!name)
+		return -ENOMEM;
+
+	mutex_lock(&reg_mutex);
+	ret = user_event_parse_cmd(name, &user);
+	mutex_unlock(&reg_mutex);
+
+	if (ret)
+		kfree(name);
+
+	return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+	struct ftrace_event_field *field, *next;
+	struct list_head *head;
+	int depth = 0;
+
+	seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+	head = trace_get_fields(&user->call);
+
+	list_for_each_entry_safe_reverse(field, next, head, link) {
+		if (depth == 0)
+			seq_puts(m, " ");
+		else
+			seq_puts(m, "; ");
+
+		seq_printf(m, "%s %s", field->type, field->name);
+
+		if (str_has_prefix(field->type, "struct "))
+			seq_printf(m, " %d", field->size);
+
+		depth++;
+	}
+
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+
+	return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+
+	if (atomic_read(&user->refcnt) != 0)
+		return -EBUSY;
+
+	return destroy_user_event(user);
+}
+
+static bool user_event_match(const char *system, const char *event,
+			     int argc, const char **argv, struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+
+	return strcmp(EVENT_NAME(user), event) == 0 &&
+		(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+}
+
+static struct dyn_event_operations user_event_dops = {
+	.create = user_event_create,
+	.show = user_event_show,
+	.is_busy = user_event_is_busy,
+	.free = user_event_free,
+	.match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+	int ret;
+
+	ret = register_trace_event(&user->call.event);
+
+	if (!ret)
+		return -ENODEV;
+
+	ret = trace_add_event_call(&user->call);
+
+	if (ret)
+		unregister_trace_event(&user->call.event);
+
+	return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+			    struct user_event **newuser)
+{
+	int ret;
+	int index;
+	u32 key;
+	struct user_event *user = find_user_event(name, &key);
+
+	if (user) {
+		*newuser = user;
+		/*
+		 * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing. + */ + kfree(name); + return 0; + } + + index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + + if (index == MAX_EVENTS) + return -EMFILE; + + user = kzalloc(sizeof(*user), GFP_KERNEL); + + if (!user) + return -ENOMEM; + + INIT_LIST_HEAD(&user->class.fields); + INIT_LIST_HEAD(&user->fields); + + user->tracepoint.name = name; + + user_event_parse_flags(user, flags); + + ret = user_event_parse_fields(user, args); + + if (ret) + goto put_user; + + /* Minimal print format */ + user->call.print_fmt = "\"\""; + + user->call.data = user; + user->call.class = &user->class; + user->call.name = name; + user->call.flags = TRACE_EVENT_FL_TRACEPOINT; + user->call.tp = &user->tracepoint; + user->call.event.funcs = &user_event_funcs; + + user->class.system = USER_EVENTS_SYSTEM; + user->class.fields_array = user_event_fields_array; + user->class.get_fields = user_event_get_fields; + user->class.reg = user_event_reg; + user->class.probe = user_event_ftrace; + + mutex_lock(&event_mutex); + ret = user_event_trace_register(user); + mutex_unlock(&event_mutex); + + if (ret) + goto put_user; + + user->index = index; + dyn_event_init(&user->devent, &user_event_dops); + dyn_event_add(&user->devent, &user->call); + set_bit(user->index, page_bitmap); + hash_add(register_table, &user->node, key); + + *newuser = user; + return 0; +put_user: + user_event_destroy_fields(user); + kfree(user); + return ret; +} + +/* + * Deletes a previously created event if it is no longer being used. + */ +static int delete_user_event(char *name) +{ + u32 key; + int ret; + struct user_event *user = find_user_event(name, &key); + + if (!user) + return -ENOENT; + + if (atomic_read(&user->refcnt) != 0) + return -EBUSY; + + mutex_lock(&event_mutex); + ret = destroy_user_event(user); + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * Validates the user payload and writes via iterator. + */ +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) +{ + struct user_event_refs *refs; + struct user_event *user = NULL; + struct tracepoint *tp; + ssize_t ret = i->count; + int idx; + + if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx))) + return -EFAULT; + + rcu_read_lock_sched(); + + refs = rcu_dereference_sched(file->private_data); + + /* + * The refs->events array is protected by RCU, and new items may be + * added. But the user retrieved from indexing into the events array + * shall be immutable while the file is opened. + */ + if (likely(refs && idx < refs->count)) + user = refs->events[idx]; + + rcu_read_unlock_sched(); + + if (unlikely(user == NULL)) + return -ENOENT; + + tp = &user->tracepoint; + + /* + * It's possible key.enabled disables after this check, however + * we don't mind if a few events are included in this condition. 
+	 */
+	if (likely(atomic_read(&tp->key.enabled) > 0)) {
+		struct tracepoint_func *probe_func_ptr;
+		user_event_func_t probe_func;
+		void *tpdata;
+		void *kdata;
+		u32 datalen;
+
+		kdata = kmalloc(i->count, GFP_KERNEL);
+
+		if (unlikely(!kdata))
+			return -ENOMEM;
+
+		datalen = copy_from_iter(kdata, i->count, i);
+
+		rcu_read_lock_sched();
+
+		probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+		if (probe_func_ptr) {
+			do {
+				probe_func = probe_func_ptr->func;
+				tpdata = probe_func_ptr->data;
+				probe_func(user, kdata, datalen, tpdata);
+			} while ((++probe_func_ptr)->func);
+		}
+
+		rcu_read_unlock_sched();
+
+		kfree(kdata);
+	}
+
+	return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	struct iovec iov;
+	struct iov_iter i;
+
+	if (unlikely(*ppos != 0))
+		return -EFAULT;
+
+	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+		return -EFAULT;
+
+	return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+	return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+	struct user_event_refs *refs, *new_refs;
+	int i, size, count = 0;
+
+	refs = rcu_dereference_protected(file->private_data,
+					 lockdep_is_held(&reg_mutex));
+
+	if (refs) {
+		count = refs->count;
+
+		for (i = 0; i < count; ++i)
+			if (refs->events[i] == user)
+				return i;
+	}
+
+	size = struct_size(refs, events, count + 1);
+
+	new_refs = kzalloc(size, GFP_KERNEL);
+
+	if (!new_refs)
+		return -ENOMEM;
+
+	new_refs->count = count + 1;
+
+	for (i = 0; i < count; ++i)
+		new_refs->events[i] = refs->events[i];
+
+	new_refs->events[i] = user;
+
+	atomic_inc(&user->refcnt);
+
+	rcu_assign_pointer(file->private_data, new_refs);
+
+	if (refs)
+		kfree_rcu(refs, rcu);
+
+	return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+	u32 size;
+	long ret;
+
+	ret = get_user(size, &ureg->size);
+
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+	struct user_reg reg;
+	struct user_event *user;
+	char *name;
+	long ret;
+
+	ret = user_reg_get(ureg, &reg);
+
+	if (ret)
+		return ret;
+
+	name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+			    MAX_EVENT_DESC);
+
+	if (IS_ERR(name)) {
+		ret = PTR_ERR(name);
+		return ret;
+	}
+
+	ret = user_event_parse_cmd(name, &user);
+
+	if (ret) {
+		kfree(name);
+		return ret;
+	}
+
+	ret = user_events_ref_add(file, user);
+
+	/* Positive number is index and valid */
+	if (ret < 0)
+		return ret;
+
+	put_user((u32)ret, &ureg->write_index);
+	put_user(user->index, &ureg->status_index);
+
+	return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+	void __user *ubuf = (void __user *)uarg;
+	char *name;
+	long ret;
+
+	name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	ret = delete_user_event(name);
+
+	kfree(name);
+
+	return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+			      unsigned long uarg)
+{
+	long ret = -ENOTTY;
+
+	switch (cmd) {
+	case DIAG_IOCSREG:
+		mutex_lock(&reg_mutex);
+		ret = user_events_ioctl_reg(file, uarg);
+		mutex_unlock(&reg_mutex);
+		break;
+
+	case DIAG_IOCSDEL:
+		mutex_lock(&reg_mutex);
+		ret = user_events_ioctl_del(file, uarg);
+		mutex_unlock(&reg_mutex);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+	struct user_event_refs *refs;
+	struct user_event *user;
+	int i;
+
+	/*
+	 * Ensure refs cannot change under any situation by taking the
+	 * register mutex during the final freeing of the references.
+	 */
+	mutex_lock(&reg_mutex);
+
+	refs = file->private_data;
+
+	if (!refs)
+		goto out;
+
+	/*
+	 * The lifetime of refs has reached an end, it's tied to this file.
+	 * The underlying user_events are ref counted, and cannot be freed.
+	 * After this decrement, the user_events may be freed elsewhere.
+	 */
+	for (i = 0; i < refs->count; ++i) {
+		user = refs->events[i];
+
+		if (user)
+			atomic_dec(&user->refcnt);
+	}
+out:
+	file->private_data = NULL;
+
+	mutex_unlock(&reg_mutex);
+
+	kfree(refs);
+
+	return 0;
+}
+
+static const struct file_operations user_data_fops = {
+	.write = user_events_write,
+	.write_iter = user_events_write_iter,
+	.unlocked_ioctl	= user_events_ioctl,
+	.release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+
+	if (size != MAX_EVENTS)
+		return -EINVAL;
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
+			       size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos)
+		return NULL;
+
+	return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+	struct user_event *user;
+	char status;
+	int i, active = 0, busy = 0, flags;
+
+	mutex_lock(&reg_mutex);
+
+	hash_for_each(register_table, i, user, node) {
+		status = register_page_data[user->index];
+		flags = user->flags;
+
+		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+		if (flags != 0 || status != 0)
+			seq_puts(m, " #");
+
+		if (status != 0) {
+			seq_puts(m, " Used by");
+			if (status & EVENT_STATUS_FTRACE)
+				seq_puts(m, " ftrace");
+			if (status & EVENT_STATUS_PERF)
+				seq_puts(m, " perf");
+			if (status & EVENT_STATUS_OTHER)
+				seq_puts(m, " other");
+			busy++;
+		}
+
+		if (flags & FLAG_BPF_ITER)
+			seq_puts(m, " FLAG:BPF_ITER");
+
+		seq_puts(m, "\n");
+		active++;
+	}
+
+	mutex_unlock(&reg_mutex);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Active: %d\n", active);
+	seq_printf(m, "Busy: %d\n", busy);
+	seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+	return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+	.start = user_seq_start,
+	.next = user_seq_next,
+	.stop = user_seq_stop,
+	.show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+	return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+	.open = user_status_open,
+	.mmap = user_status_mmap,
+	.read = seq_read,
+	.llseek	= seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+	struct dentry *edata, *emmap;
+
+	edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+				    NULL, NULL, &user_data_fops);
+
+	if (!edata) {
+		pr_warn("Could not create tracefs 'user_events_data' entry\n");
+		goto err;
+	}
+
+	/* mmap with MAP_SHARED requires writable fd */
+	emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+				    NULL, NULL, &user_status_fops);
+
+	if (!emmap) {
+		tracefs_remove(edata);
+		pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+		goto err;
+	}
+
+	return 0;
+err:
+	return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+	int page;
+
+	for (page = 0; page < MAX_PAGES; ++page) {
+		void *addr = register_page_data + (PAGE_SIZE * page);
+
+		if (set)
+			SetPageReserved(virt_to_page(addr));
+		else
+			ClearPageReserved(virt_to_page(addr));
+	}
+}
+
+static int __init trace_events_user_init(void)
+{
+	int ret;
+
+	/* Zero all bits beside 0 (which is reserved for failures) */
+	bitmap_zero(page_bitmap, MAX_EVENTS);
+	set_bit(0, page_bitmap);
+
+	register_page_data = kzalloc(MAX_EVENTS, GFP_KERNEL);
+
+	if (!register_page_data)
+		return -ENOMEM;
+
+	set_page_reservations(true);
+
+	ret = create_user_tracefs();
+
+	if (ret) {
+		pr_warn("user_events could not register with tracefs\n");
+		set_page_reservations(false);
+		kfree(register_page_data);
+		return ret;
+	}
+
+	if (dyn_event_register(&user_event_dops))
+		pr_warn("user_events could not register with dyn_events\n");
+
+	return 0;
+}
+
+fs_initcall(trace_events_user_init);
--
cgit v1.2.3


From a6a6fe27bab48f0d09a64b051e7bde432fcae081 Mon Sep 17 00:00:00 2001
From: "D. Wythe"
Date: Thu, 10 Feb 2022 17:11:37 +0800
Subject: net/smc: Dynamic control handshake limitation by socket options

This patch adds dynamic, per-socket control of the SMC handshake
limitation. In production environments the same application may handle
different service types and hold different opinions on whether SMC
handshakes should be limited. Socket options are used to express this;
since SMC had no socket option level of its own yet, one is introduced
at the same time.

This patch does the following:
- add a new socket option level: SOL_SMC.
- add a new SMC socket option: SMC_LIMIT_HS.
- provide a getter/setter for SMC socket options.
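As a usage illustration only (a minimal sketch, not part of the patch):
a process that wants to cap SMC handshakes on a socket could use the new
option as below. SOL_SMC and SMC_LIMIT_HS are mirrored from the hunks
that follow purely to keep the sketch self-contained; AF_SMC (43) and
protocol 0 (SMCPROTO_SMC) are pre-existing SMC constants.

	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef AF_SMC
	#define AF_SMC 43
	#endif
	#define SOL_SMC 286		/* mirrored from include/linux/socket.h */
	#define SMC_LIMIT_HS 1		/* mirrored from include/uapi/linux/smc.h */

	int main(void)
	{
		int fd = socket(AF_SMC, SOCK_STREAM, 0);	/* 0 == SMCPROTO_SMC */
		int one = 1, val = 0;
		socklen_t len = sizeof(val);

		if (fd < 0) {
			perror("socket");
			return 1;
		}

		/* Ask the stack to fall back to TCP when handshakes pile up. */
		if (setsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &one, sizeof(one)))
			perror("setsockopt(SMC_LIMIT_HS)");

		/* Read the flag back via the matching getter. */
		if (!getsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &val, &len))
			printf("SMC_LIMIT_HS = %d\n", val);
		return 0;
	}

Note the flag only takes effect for listening sockets: as the af_smc.c
hunk below shows, smc_hs_congested is installed at listen() time only
when limit_smc_hs is set.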
From a6a6fe27bab48f0d09a64b051e7bde432fcae081 Mon Sep 17 00:00:00 2001
From: "D. Wythe"
Date: Thu, 10 Feb 2022 17:11:37 +0800
Subject: net/smc: Dynamic control handshake limitation by socket options

This patch adds dynamic, per-socket control of the SMC handshake
limitation. In production environments, the same application may handle
different service types and hence have different requirements regarding
the SMC handshake limitation.

Socket options are used to implement this; since there is no socket
option level for SMC yet, it is introduced at the same time.

This patch does the following:

- add new socket option level: SOL_SMC.
- add new SMC socket option: SMC_LIMIT_HS.
- provide getter/setter for SMC socket options.

Link: https://lore.kernel.org/all/20f504f961e1a803f85d64229ad84260434203bd.1644323503.git.alibuda@linux.alibaba.com/
Signed-off-by: D. Wythe
Signed-off-by: David S. Miller
---
 include/linux/socket.h | 1 +
 include/uapi/linux/smc.h | 4 +++
 net/smc/af_smc.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++-
 net/smc/smc.h | 1 +
 4 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 8ef26d89ef49..6f85f5d957ef 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -366,6 +366,7 @@ struct ucred {
 #define SOL_XDP		283
 #define SOL_MPTCP	284
 #define SOL_MCTP	285
+#define SOL_SMC		286

 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index 6c2874fd2c00..343e7450c3a3 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -284,4 +284,8 @@ enum {
 	__SMC_NLA_SEID_TABLE_MAX,
 	SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1
 };
+
+/* SMC socket options */
+#define SMC_LIMIT_HS 1	/* constraint on smc handshake */
+
 #endif /* _UAPI_LINUX_SMC_H */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index a05ffb268c3e..97dcdc0a2107 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2326,7 +2326,8 @@ static int smc_listen(struct socket *sock, int backlog)

 	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;

-	tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+	if (smc->limit_smc_hs)
+		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;

 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc) {
@@ -2621,6 +2622,67 @@ out:
 	return rc ? rc : rc1;
 }

+static int __smc_getsockopt(struct socket *sock, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	struct smc_sock *smc;
+	int val, len;
+
+	smc = smc_sk(sock->sk);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(int, len, sizeof(int));
+
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case SMC_LIMIT_HS:
+		val = smc->limit_smc_hs;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int __smc_setsockopt(struct socket *sock, int level, int optname,
+			    sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int val, rc;
+
+	smc = smc_sk(sk);
+
+	lock_sock(sk);
+	switch (optname) {
+	case SMC_LIMIT_HS:
+		if (optlen < sizeof(int)) {
+			rc = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&val, optval, sizeof(int))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		smc->limit_smc_hs = !!val;
+		rc = 0;
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+		break;
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
 static int smc_setsockopt(struct socket *sock, int level, int optname,
 			  sockptr_t optval, unsigned int optlen)
 {
@@ -2630,6 +2692,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,

 	if (level == SOL_TCP && optname == TCP_ULP)
 		return -EOPNOTSUPP;
+	else if (level == SOL_SMC)
+		return __smc_setsockopt(sock, level, optname, optval, optlen);

 	smc = smc_sk(sk);

@@ -2712,6 +2776,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
 	struct smc_sock *smc;
 	int rc;

+	if (level == SOL_SMC)
+		return __smc_getsockopt(sock, level, optname, optval, optlen);
+
 	smc = smc_sk(sock->sk);
 	mutex_lock(&smc->clcsock_release_lock);
 	if (!smc->clcsock) {
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e91e40040d07..7e2693832a1b 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -249,6 +249,7 @@ struct smc_sock {				/* smc sock container */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
+	bool			limit_smc_hs;	/* put constraint on handshake */
 	bool			use_fallback;	/* fallback to tcp */
 	int			fallback_rsn;	/* reason for fallback */
 	u32			peer_diagnosis; /* decline reason from peer */
-- cgit v1.2.3
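As a rough user-space illustration of the new option (a sketch, not part
of the patch; the AF_SMC socket creation is assumed to happen elsewhere):

	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef SOL_SMC
	#define SOL_SMC 286		/* matches include/linux/socket.h above */
	#endif
	#ifndef SMC_LIMIT_HS
	#define SMC_LIMIT_HS 1		/* matches include/uapi/linux/smc.h above */
	#endif

	/* fd is assumed to be an AF_SMC socket created elsewhere */
	static int smc_limit_handshake(int fd, int on)
	{
		int val = !!on;

		if (setsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &val, sizeof(val)) < 0) {
			perror("setsockopt(SOL_SMC, SMC_LIMIT_HS)");
			return -1;
		}
		return 0;
	}

Reading the option back with getsockopt() returns the current setting as
an int, as implemented by __smc_getsockopt() above.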
From f9496b7c1b48ce02cd17a3ee88b1e049c689a222 Mon Sep 17 00:00:00 2001
From: "D. Wythe"
Date: Thu, 10 Feb 2022 17:11:38 +0800
Subject: net/smc: Add global configure for handshake limitation by netlink

The SMC handshake limitation can be controlled through socket options,
but that means applications which need it must modify their code, which
is quite troublesome for many existing applications.

This patch allows the global default value of the SMC handshake
limitation to be set through netlink, providing a way to put a
constraint on the handshake without modifying any application code.

Suggested-by: Tony Lu
Signed-off-by: D. Wythe
Reviewed-by: Tony Lu
Signed-off-by: David S. Miller
---
 include/net/netns/smc.h | 2 ++
 include/uapi/linux/smc.h | 11 +++++++++++
 net/smc/af_smc.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc.h | 6 ++++++
 net/smc/smc_netlink.c | 15 +++++++++++++++
 net/smc/smc_pnet.c | 3 +++
 6 files changed, 79 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index ea8a9cf2619b..47b166684fd8 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -12,5 +12,7 @@ struct netns_smc {
 	/* protect fback_rsn */
 	struct mutex			mutex_fback_rsn;
 	struct smc_stats_rsn		*fback_rsn;
+
+	bool				limit_smc_hs;	/* constraint on handshake */
 };
 #endif
diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index 343e7450c3a3..693f549f6966 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -59,6 +59,9 @@ enum {
 	SMC_NETLINK_DUMP_SEID,
 	SMC_NETLINK_ENABLE_SEID,
 	SMC_NETLINK_DISABLE_SEID,
+	SMC_NETLINK_DUMP_HS_LIMITATION,
+	SMC_NETLINK_ENABLE_HS_LIMITATION,
+	SMC_NETLINK_DISABLE_HS_LIMITATION,
 };

 /* SMC_GENL_FAMILY top level attributes */
@@ -285,6 +288,14 @@ enum {
 	SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1
 };

+/* SMC_NETLINK_HS_LIMITATION attributes */
+enum {
+	SMC_NLA_HS_LIMITATION_UNSPEC,
+	SMC_NLA_HS_LIMITATION_ENABLED,	/* u8 */
+	__SMC_NLA_HS_LIMITATION_MAX,
+	SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1
+};
+
 /* SMC socket options */
 #define SMC_LIMIT_HS 1	/* constraint on smc handshake */

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 97dcdc0a2107..246c874de629 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -66,6 +66,45 @@ struct workqueue_struct	*smc_close_wq;	/* wq for close work */

 static void smc_tcp_listen_work(struct work_struct *);
 static void smc_connect_work(struct work_struct *);

+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+	void *hdr;
+
+	if (cb_ctx->pos[0])
+		goto out;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &smc_gen_nl_family, NLM_F_MULTI,
+			  SMC_NETLINK_DUMP_HS_LIMITATION);
+	if (!hdr)
+		return -ENOMEM;
+
+	if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
+		       sock_net(skb->sk)->smc.limit_smc_hs))
+		goto err;
+
+	genlmsg_end(skb, hdr);
+	cb_ctx->pos[0] = 1;
+out:
+	return skb->len;
+err:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+	sock_net(skb->sk)->smc.limit_smc_hs = true;
+	return 0;
+}
+
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+	sock_net(skb->sk)->smc.limit_smc_hs = false;
+	return 0;
+}
+
 static void smc_set_keepalive(struct sock *sk, int val)
 {
 	struct smc_sock *smc = smc_sk(sk);
@@ -3007,6 +3046,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol,
 	smc->use_fallback = false; /* assume rdma capability first */
 	smc->fallback_rsn = 0;

+	/* default behavior from limit_smc_hs in every net namespace */
+	smc->limit_smc_hs = net->smc.limit_smc_hs;
+
 	rc = 0;
 	if (!clcsock) {
 		rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 7e2693832a1b..a096d8af21a0 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -14,6 +14,7 @@
 #include <linux/socket.h>
 #include <linux/types.h>
 #include <linux/compiler.h> /* __aligned */
+#include <net/genetlink.h>
 #include <net/sock.h>

 #include "smc_ib.h"
@@ -336,4 +337,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr,
 		       struct smc_gidlist *gidlist,
 		       struct smc_ib_device *known_dev, u8 *known_gid);

+/* smc handshake limitation interface for netlink */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+
 #endif /* __SMC_H */
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
index f13ab0661ed5..c5a62f6f52ba 100644
--- a/net/smc/smc_netlink.c
+++ b/net/smc/smc_netlink.c
@@ -111,6 +111,21 @@ static const struct genl_ops smc_gen_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.doit = smc_nl_disable_seid,
 	},
+	{
+		.cmd = SMC_NETLINK_DUMP_HS_LIMITATION,
+		/* can be retrieved by unprivileged users */
+		.dumpit = smc_nl_dump_hs_limitation,
+	},
+	{
+		.cmd = SMC_NETLINK_ENABLE_HS_LIMITATION,
+		.flags = GENL_ADMIN_PERM,
+		.doit = smc_nl_enable_hs_limitation,
+	},
+	{
+		.cmd = SMC_NETLINK_DISABLE_HS_LIMITATION,
+		.flags = GENL_ADMIN_PERM,
+		.doit = smc_nl_disable_hs_limitation,
+	},
 };

 static const struct nla_policy smc_gen_nl_policy[2] = {
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 0599246c0376..ff61b7b95875 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -870,6 +870,9 @@ int smc_pnet_net_init(struct net *net)

 	smc_pnet_create_pnetids_list(net);

+	/* disable handshake limitation by default */
+	net->smc.limit_smc_hs = 0;
+
 	return 0;
 }
-- cgit v1.2.3
From e9e9feebcbc14b174fef862842f8cc9a388e1db3 Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch
Date: Fri, 11 Feb 2022 19:22:10 +0100
Subject: KVM: s390: Add optional storage key checking to MEMOP IOCTL

User space needs a mechanism to perform key checked accesses when
emulating instructions.

The key can be passed as an additional argument.
Having an additional argument is flexible, as user space can
pass the guest PSW's key, in order to make an access the same way the
CPU would, or pass another key if necessary.
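As a rough sketch of how an instruction emulator in user space might use
this (vcpu_fd, guest_addr and psw_key are placeholders, and the exception
handler is hypothetical):

	__u8 buf[16];
	struct kvm_s390_mem_op op = {
		.gaddr = guest_addr,		/* logical guest address */
		.size  = sizeof(buf),
		.op    = KVM_S390_MEMOP_LOGICAL_READ,
		.buf   = (__u64)(unsigned long)buf,
		.ar    = 0,			/* access register number */
		.key   = psw_key,		/* e.g. the guest PSW's key */
		.flags = KVM_S390_MEMOP_F_SKEY_PROTECTION,
	};
	int rc = ioctl(vcpu_fd, KVM_S390_MEM_OP, &op);

	if (rc > 0)
		handle_program_interruption(rc);	/* hypothetical; rc is the PGM code */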
Signed-off-by: Janis Schoetterl-Glausch Reviewed-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20220211182215.2730017-6-scgl@linux.ibm.com Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 31 +++++++++++++++++++++---------- include/uapi/linux/kvm.h | 6 +++++- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fdbd6c1dc709..c31b40abfa23 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2359,6 +2359,11 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) return r; } +static bool access_key_invalid(u8 access_key) +{ + return access_key > 0xf; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4692,17 +4697,21 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, void *tmpbuf = NULL; int r = 0; const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY; + | KVM_S390_MEMOP_F_CHECK_ONLY + | KVM_S390_MEMOP_F_SKEY_PROTECTION; if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) return -EINVAL; - if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - if (kvm_s390_pv_cpu_is_protected(vcpu)) return -EINVAL; - + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (access_key_invalid(mop->key)) + return -EINVAL; + } else { + mop->key = 0; + } if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) @@ -4712,11 +4721,12 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_FETCH, 0); + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + GACC_FETCH, mop->key); break; } - r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); if (r == 0) { if (copy_to_user(uaddr, tmpbuf, mop->size)) r = -EFAULT; @@ -4724,15 +4734,16 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, break; case KVM_S390_MEMOP_LOGICAL_WRITE: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_STORE, 0); + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + GACC_STORE, mop->key); break; } if (copy_from_user(tmpbuf, uaddr, mop->size)) { r = -EFAULT; break; } - r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5191b57e1562..4566f429db2c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -562,7 +562,10 @@ struct kvm_s390_mem_op { __u32 op; /* type of operation */ __u64 buf; /* buffer in userspace */ union { - __u8 ar; /* the access register number */ + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + }; __u32 sida_offset; /* offset into the sida */ __u8 reserved[32]; /* should be set to 0 */ }; @@ -575,6 +578,7 @@ struct kvm_s390_mem_op { /* flags for kvm_s390_mem_op->flags */ #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) /* for KVM_INTERRUPT */ struct kvm_interrupt { 
-- cgit v1.2.3
From ef11c9463ae006302ce170a401854a48ea0532ca Mon Sep 17 00:00:00 2001
From: Janis Schoetterl-Glausch
Date: Fri, 11 Feb 2022 19:22:11 +0100
Subject: KVM: s390: Add vm IOCTL for key checked guest absolute memory access

Channel I/O honors storage keys and is performed on absolute memory.
For I/O emulation user space therefore needs to be able to do key
checked accesses.
The vm IOCTL supports read/write accesses, as well as checking
if an access would succeed.
Unlike an approach relying on KVM_S390_GET_SKEYS for key checking,
the vm IOCTL performs the check in lockstep with the read or write, by
ultimately mapping the access to move instructions that support key
protection checking with a supplied key.
Fetch and storage protection override are not applicable to absolute
accesses and so are not applied as they are when using the vcpu memop.

Signed-off-by: Janis Schoetterl-Glausch
Reviewed-by: Christian Borntraeger
Link: https://lore.kernel.org/r/20220211182215.2730017-7-scgl@linux.ibm.com
Signed-off-by: Christian Borntraeger
---
 arch/s390/kvm/gaccess.c | 72 ++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/gaccess.h | 6 ++++
 arch/s390/kvm/kvm-s390.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h | 2 ++
 4 files changed, 161 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 37838f637707..d53a183c2005 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -795,6 +795,35 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
 	return 1;
 }

+static int vm_check_access_key(struct kvm *kvm, u8 access_key,
+			       enum gacc_mode mode, gpa_t gpa)
+{
+	u8 storage_key, access_control;
+	bool fetch_protected;
+	unsigned long hva;
+	int r;
+
+	if (access_key == 0)
+		return 0;
+
+	hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+	if (kvm_is_error_hva(hva))
+		return PGM_ADDRESSING;
+
+	mmap_read_lock(current->mm);
+	r = get_guest_storage_key(current->mm, hva, &storage_key);
+	mmap_read_unlock(current->mm);
+	if (r)
+		return r;
+	access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
+	if (access_control == access_key)
+		return 0;
+	fetch_protected = storage_key & _PAGE_FP_BIT;
+	if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected)
+		return 0;
+	return PGM_PROTECTION;
+}
+
 static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode,
 					   union asce asce)
 {
@@ -994,6 +1023,26 @@ access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
 	return 0;
 }

+int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
+			      unsigned long len, enum gacc_mode mode, u8 access_key)
+{
+	int offset = offset_in_page(gpa);
+	int fragment_len;
+	int rc;
+
+	while (min(PAGE_SIZE - offset, len) > 0) {
+		fragment_len = min(PAGE_SIZE - offset, len);
+		rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key);
+		if (rc)
+			return rc;
+		offset = 0;
+		len -= fragment_len;
+		data += fragment_len;
+		gpa += fragment_len;
+	}
+	return 0;
+}
+
 int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 			  void *data, unsigned long len, enum gacc_mode mode,
 			  u8 access_key)
@@ -1144,6 +1193,29 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 	return rc;
 }

+/**
+ * check_gpa_range - test a range of guest physical addresses for accessibility
+ * @kvm: virtual machine instance
+ * @gpa: guest physical address
+ * @length: length of test range
+ * @mode: access mode to test, relevant for storage keys
 * @access_key: access key to match the storage keys with
+ */
+int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
+		    enum gacc_mode mode, u8 access_key)
+{
+	unsigned int fragment_len;
+	int rc = 0;
+
+	while (length && !rc) {
+		fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length);
+		rc = vm_check_access_key(kvm, access_key, mode, gpa);
+		length -= fragment_len;
+		gpa += fragment_len;
+	}
+	return rc;
+}
+
 /**
  * kvm_s390_check_low_addr_prot_real - check for low-address protection
  * @vcpu: virtual cpu
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index c5f2e7311b17..1124ff282012 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -193,6 +193,12 @@ int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u
 int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
 		    unsigned long length, enum gacc_mode mode, u8 access_key);

+int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
+		    enum gacc_mode mode, u8 access_key);
+
+int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
+			      unsigned long len, enum gacc_mode mode, u8 access_key);
+
 int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 			  void *data, unsigned long len, enum gacc_mode mode,
 			  u8 access_key);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c31b40abfa23..36bc73b5f5de 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2364,6 +2364,78 @@ static bool access_key_invalid(u8 access_key)
 	return access_key > 0xf;
 }

+static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+	void __user *uaddr = (void __user *)mop->buf;
+	u64 supported_flags;
+	void *tmpbuf = NULL;
+	int r, srcu_idx;
+
+	supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
+			  | KVM_S390_MEMOP_F_CHECK_ONLY;
+	if (mop->flags & ~supported_flags)
+		return -EINVAL;
+	if (mop->size > MEM_OP_MAX_SIZE)
+		return -E2BIG;
+	if (kvm_s390_pv_is_protected(kvm))
+		return -EINVAL;
+	if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
+		if (access_key_invalid(mop->key))
+			return -EINVAL;
+	} else {
+		mop->key = 0;
+	}
+	if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
+		tmpbuf = vmalloc(mop->size);
+		if (!tmpbuf)
+			return -ENOMEM;
+	}
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+		r = PGM_ADDRESSING;
+		goto out_unlock;
+	}
+
+	switch (mop->op) {
+	case KVM_S390_MEMOP_ABSOLUTE_READ: {
+		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+			r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key);
+		} else {
+			r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+						      mop->size, GACC_FETCH, mop->key);
+			if (r == 0) {
+				if (copy_to_user(uaddr, tmpbuf, mop->size))
+					r = -EFAULT;
+			}
+		}
+		break;
+	}
+	case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
+		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+			r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
+		} else {
+			if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+				r = -EFAULT;
+				break;
+			}
+			r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+						      mop->size, GACC_STORE, mop->key);
+		}
+		break;
+	}
+	default:
+		r = -EINVAL;
+	}
+
+out_unlock:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	vfree(tmpbuf);
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -2488,6 +2560,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		}
 		break;
 	}
+	case KVM_S390_MEM_OP: {
+		struct kvm_s390_mem_op mem_op;
+
+		if (copy_from_user(&mem_op, argp,
sizeof(mem_op)) == 0) + r = kvm_s390_vm_mem_op(kvm, &mem_op); + else + r = -EFAULT; + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4566f429db2c..4bc7623def87 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -575,6 +575,8 @@ struct kvm_s390_mem_op { #define KVM_S390_MEMOP_LOGICAL_WRITE 1 #define KVM_S390_MEMOP_SIDA_READ 2 #define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 /* flags for kvm_s390_mem_op->flags */ #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) -- cgit v1.2.3 From d004079edc166ff19605475211305923c708b4d5 Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 11 Feb 2022 19:22:13 +0100 Subject: KVM: s390: Add capability for storage key extension of MEM_OP IOCTL Availability of the KVM_CAP_S390_MEM_OP_EXTENSION capability signals that: * The vcpu MEM_OP IOCTL supports storage key checking. * The vm MEM_OP IOCTL exists. Signed-off-by: Janis Schoetterl-Glausch Reviewed-by: Janosch Frank Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20220211182215.2730017-9-scgl@linux.ibm.com Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 1 + include/uapi/linux/kvm.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 773bccdd446c..c2c26c2aad64 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -564,6 +564,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: + case KVM_CAP_S390_MEM_OP_EXTENSION: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4bc7623def87..08756eeea065 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1140,6 +1140,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_S390_MEM_OP_EXTENSION 210 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 5e35d0eb472b48ac9c8ef7017753b8a1f765aa01 Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 11 Feb 2022 19:22:14 +0100 Subject: KVM: s390: Update api documentation for memop ioctl Document all currently existing operations, flags and explain under which circumstances they are available. Document the recently introduced absolute operations and the storage key protection flag, as well as the existing SIDA operations. 
Signed-off-by: Janis Schoetterl-Glausch
Reviewed-by: Janosch Frank
Link: https://lore.kernel.org/r/20220211182215.2730017-10-scgl@linux.ibm.com
Signed-off-by: Christian Borntraeger
---
 Documentation/virt/kvm/api.rst | 112 +++++++++++++++++++++++++++++++++--------
 include/uapi/linux/kvm.h | 2 +-
 2 files changed, 91 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index a4267104db50..48f23bb80d7f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -3683,15 +3683,17 @@ The fields in each entry are defined as follows:
 4.89 KVM_S390_MEM_OP
 --------------------

-:Capability: KVM_CAP_S390_MEM_OP
+:Capability: KVM_CAP_S390_MEM_OP, KVM_CAP_S390_PROTECTED, KVM_CAP_S390_MEM_OP_EXTENSION
 :Architectures: s390
-:Type: vcpu ioctl
+:Type: vm ioctl, vcpu ioctl
 :Parameters: struct kvm_s390_mem_op (in)
 :Returns: = 0 on success,
           < 0 on generic error (e.g. -EFAULT or -ENOMEM),
           > 0 if an exception occurred while walking the page tables

-Read or write data from/to the logical (virtual) memory of a VCPU.
+Read or write data from/to the VM's memory.
+The KVM_CAP_S390_MEM_OP_EXTENSION capability specifies what functionality is
+supported.

 Parameters are specified via the following structure::

@@ -3701,33 +3703,99 @@ Parameters are specified via the following structure::
 	__u32 size;		/* amount of bytes */
 	__u32 op;		/* type of operation */
 	__u64 buf;		/* buffer in userspace */
-	__u8 ar;		/* the access register number */
-	__u8 reserved[31];	/* should be set to 0 */
+	union {
+		struct {
+			__u8 ar;	/* the access register number */
+			__u8 key;	/* access key, ignored if flag unset */
+		};
+		__u32 sida_offset; /* offset into the sida */
+		__u8 reserved[32]; /* ignored */
+	};
 };

-The type of operation is specified in the "op" field. It is either
-KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or
-KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The
-KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check
-whether the corresponding memory access would create an access exception
-(without touching the data in the memory at the destination). In case an
-access exception occurred while walking the MMU tables of the guest, the
-ioctl returns a positive error number to indicate the type of exception.
-This exception is also raised directly at the corresponding VCPU if the
-flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field.
-
 The start address of the memory region has to be specified in the "gaddr"
 field, and the length of the region in the "size" field (which must not
 be 0). The maximum value for "size" can be obtained by checking the
 KVM_CAP_S390_MEM_OP capability. "buf" is the buffer supplied by the
 userspace application where the read data should be written to for
-KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written is
-stored for a KVM_S390_MEMOP_LOGICAL_WRITE. When KVM_S390_MEMOP_F_CHECK_ONLY
-is specified, "buf" is unused and can be NULL. "ar" designates the access
-register number to be used; the valid range is 0..15.
+a read access, or where the data that should be written is stored for
+a write access. The "reserved" field is meant for future extensions.
+Reserved and unused values are ignored. Future extensions that add members must
+introduce new flags.
+
+The type of operation is specified in the "op" field. Flags modifying
+their behavior can be set in the "flags" field.
Undefined flag bits must +be set to 0. + +Possible operations are: + * ``KVM_S390_MEMOP_LOGICAL_READ`` + * ``KVM_S390_MEMOP_LOGICAL_WRITE`` + * ``KVM_S390_MEMOP_ABSOLUTE_READ`` + * ``KVM_S390_MEMOP_ABSOLUTE_WRITE`` + * ``KVM_S390_MEMOP_SIDA_READ`` + * ``KVM_S390_MEMOP_SIDA_WRITE`` + +Logical read/write: +^^^^^^^^^^^^^^^^^^^ + +Access logical memory, i.e. translate the given guest address to an absolute +address given the state of the VCPU and use the absolute address as target of +the access. "ar" designates the access register number to be used; the valid +range is 0..15. +Logical accesses are permitted for the VCPU ioctl only. +Logical accesses are permitted for non-protected guests only. + +Supported flags: + * ``KVM_S390_MEMOP_F_CHECK_ONLY`` + * ``KVM_S390_MEMOP_F_INJECT_EXCEPTION`` + * ``KVM_S390_MEMOP_F_SKEY_PROTECTION`` + +The KVM_S390_MEMOP_F_CHECK_ONLY flag can be set to check whether the +corresponding memory access would cause an access exception; however, +no actual access to the data in memory at the destination is performed. +In this case, "buf" is unused and can be NULL. + +In case an access exception occurred during the access (or would occur +in case of KVM_S390_MEMOP_F_CHECK_ONLY), the ioctl returns a positive +error number indicating the type of exception. This exception is also +raised directly at the corresponding VCPU if the flag +KVM_S390_MEMOP_F_INJECT_EXCEPTION is set. + +If the KVM_S390_MEMOP_F_SKEY_PROTECTION flag is set, storage key +protection is also in effect and may cause exceptions if accesses are +prohibited given the access key passed in "key". +KVM_S390_MEMOP_F_SKEY_PROTECTION is available if KVM_CAP_S390_MEM_OP_EXTENSION +is > 0. + +Absolute read/write: +^^^^^^^^^^^^^^^^^^^^ + +Access absolute memory. This operation is intended to be used with the +KVM_S390_MEMOP_F_SKEY_PROTECTION flag, to allow accessing memory and performing +the checks required for storage key protection as one operation (as opposed to +user space getting the storage keys, performing the checks, and accessing +memory thereafter, which could lead to a delay between check and access). +Absolute accesses are permitted for the VM ioctl if KVM_CAP_S390_MEM_OP_EXTENSION +is > 0. +Currently absolute accesses are not permitted for VCPU ioctls. +Absolute accesses are permitted for non-protected guests only. + +Supported flags: + * ``KVM_S390_MEMOP_F_CHECK_ONLY`` + * ``KVM_S390_MEMOP_F_SKEY_PROTECTION`` + +The semantics of the flags are as for logical accesses. + +SIDA read/write: +^^^^^^^^^^^^^^^^ + +Access the secure instruction data area which contains memory operands necessary +for instruction emulation for protected guests. +SIDA accesses are available if the KVM_CAP_S390_PROTECTED capability is available. +SIDA accesses are permitted for the VCPU ioctl only. +SIDA accesses are permitted for protected guests only. -The "reserved" field is meant for future extensions. It is not used by -KVM with the currently defined set of flags. +No flags are supported. 
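As an illustrative aside (a sketch, not part of the patch; vm_fd and the
guest address are placeholders), a key-checked absolute read using the
operations documented above looks roughly like::

	__u8 data[256];
	struct kvm_s390_mem_op op = {
		.gaddr = 0x10000,	/* guest absolute address, placeholder */
		.size  = sizeof(data),
		.op    = KVM_S390_MEMOP_ABSOLUTE_READ,
		.buf   = (__u64)(unsigned long)data,
		.key   = 3,		/* access key checked against the storage keys */
		.flags = KVM_S390_MEMOP_F_SKEY_PROTECTION,
	};
	int rc = ioctl(vm_fd, KVM_S390_MEM_OP, &op);

Here rc == 0 means success, rc < 0 a generic error, and rc > 0 the
program interruption code, for example PGM_PROTECTION when the access
key does not grant access.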
4.90 KVM_S390_GET_SKEYS ----------------------- diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 08756eeea065..dbc550bbd9fa 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -567,7 +567,7 @@ struct kvm_s390_mem_op { __u8 key; /* access key, ignored if flag unset */ }; __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* should be set to 0 */ + __u8 reserved[32]; /* ignored */ }; }; /* types for kvm_s390_mem_op->op */ -- cgit v1.2.3 From 761b9b366cec0c81a1cd80930f00611d86521d1b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 31 Jan 2022 16:54:53 +0000 Subject: elf: Introduce the ARM MTE ELF segment type Memory tags will be dumped in the core file as segments with their own type. Discussions with the binutils and the generic ABI community settled on using new definitions in the PT_*PROC space (and to be documented in the processor-specific ABIs). Introduce PT_ARM_MEMTAG_MTE as (PT_LOPROC + 0x1). Not included in this patch since there is no upstream support but the CHERI/BSD community will also reserve: #define PT_ARM_MEMTAG_CHERI (PT_LOPROC + 0x2) #define PT_RISCV_MEMTAG_CHERI (PT_LOPROC + 0x3) Signed-off-by: Catalin Marinas Acked-by: Luis Machado Link: https://lore.kernel.org/r/20220131165456.2160675-3-catalin.marinas@arm.com Signed-off-by: Will Deacon --- include/uapi/linux/elf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 61bf4774b8f2..fe8e5b74cb39 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -40,6 +40,9 @@ typedef __s64 Elf64_Sxword; #define PT_GNU_STACK (PT_LOOS + 0x474e551) +/* ARM MTE memory tag segment type */ +#define PT_ARM_MEMTAG_MTE (PT_LOPROC + 0x1) + /* * Extended Numbering * -- cgit v1.2.3 From 5cd5a8a3e2fb11b1c8a09f062c44c1e228ef987a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:53 +0100 Subject: cfg80211: Add data structures to capture EHT capabilities And advertise EHT capabilities to user space when supported. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.6fb70658529f.I2413a37c8f7d2d6d638038a3d95360a3fce0114d@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 63 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 12 +++++++++ net/wireless/nl80211.c | 27 +++++++++++++++++++ 3 files changed, 102 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f6db085ff3df..7bec15f605be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -360,6 +360,48 @@ struct ieee80211_sta_he_cap { u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; }; +/** + * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS + * + * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS + * and NSS Set field" + * + * @only_20mhz: MCS/NSS support for 20 MHz-only STA. 
+ * @bw._80: MCS/NSS support for BW <= 80 MHz
+ * @bw._160: MCS/NSS support for BW = 160 MHz
+ * @bw._320: MCS/NSS support for BW = 320 MHz
+ */
+struct ieee80211_eht_mcs_nss_supp {
+	union {
+		struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz;
+		struct {
+			struct ieee80211_eht_mcs_nss_supp_bw _80;
+			struct ieee80211_eht_mcs_nss_supp_bw _160;
+			struct ieee80211_eht_mcs_nss_supp_bw _320;
+		} __packed bw;
+	} __packed;
+} __packed;
+
+#define IEEE80211_EHT_PPE_THRES_MAX_LEN	32
+
+/**
+ * struct ieee80211_sta_eht_cap - STA's EHT capabilities
+ *
+ * This structure describes most essential parameters needed
+ * to describe 802.11be EHT capabilities for a STA.
+ *
+ * @has_eht: true iff EHT data is valid.
+ * @eht_cap_elem: Fixed portion of the eht capabilities element.
+ * @eht_mcs_nss_supp: The supported NSS/MCS combinations.
+ * @eht_ppe_thres: Holds the PPE Thresholds data.
+ */
+struct ieee80211_sta_eht_cap {
+	bool has_eht;
+	struct ieee80211_eht_cap_elem_fixed eht_cap_elem;
+	struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp;
+	u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
+};
+
 /**
  * struct ieee80211_sband_iftype_data - sband data per interface type
  *
@@ -379,6 +421,7 @@ struct ieee80211_sband_iftype_data {
 	u16 types_mask;
 	struct ieee80211_sta_he_cap he_cap;
 	struct ieee80211_he_6ghz_capa he_6ghz_capa;
+	struct ieee80211_sta_eht_cap eht_cap;
 	struct {
 		const u8 *data;
 		unsigned int len;
@@ -561,6 +604,26 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
 	return data->he_6ghz_capa.capa;
 }

+/**
+ * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype
+ * @sband: the sband to search for the iftype on
+ * @iftype: enum nl80211_iftype
+ *
+ * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL if none found
+ */
+static inline const struct ieee80211_sta_eht_cap *
+ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband,
+			     enum nl80211_iftype iftype)
+{
+	const struct ieee80211_sband_iftype_data *data =
+		ieee80211_get_sband_iftype_data(sband, iftype);
+
+	if (data && data->eht_cap.has_eht)
+		return &data->eht_cap;
+
+	return NULL;
+}
+
 /**
  * wiphy_read_of_freq_limits - read frequency limits from device tree
  *
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 195a238a322e..d305a8b8c536 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3766,6 +3766,14 @@ enum nl80211_mpath_info {
 *	given for all 6 GHz band channels
 * @NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS: vendor element capabilities that are
 *	advertised on this band/for this iftype (binary)
+ * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC: EHT MAC capabilities as in EHT
+ *	capabilities element
+ * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY: EHT PHY capabilities as in EHT
+ *	capabilities element
+ * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET: EHT supported NSS/MCS as in EHT
+ *	capabilities element
+ * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE: EHT PPE thresholds information as
+ *	defined in EHT capabilities element
 * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use
 * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band attribute currently defined
 */
@@ -3779,6 +3787,10 @@ enum nl80211_band_iftype_attr {
 	NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
 	NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA,
 	NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS,
+	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC,
+	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY,
+	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET,
+	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE,

	/* keep last */
	__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5c49d7c4bcae..2dfd45f35486 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1737,6 +1737,7 @@ nl80211_send_iftype_data(struct sk_buff *msg,
			 const struct ieee80211_sband_iftype_data *iftdata)
 {
 	const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap;
+	const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap;

 	if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
				iftdata->types_mask))
@@ -1757,6 +1758,32 @@ nl80211_send_iftype_data(struct sk_buff *msg,
 		return -ENOBUFS;
 	}

+	if (eht_cap->has_eht && he_cap->has_he) {
+		u8 mcs_nss_size, ppe_thresh_size;
+		u16 ppe_thres_hdr;
+
+		mcs_nss_size =
+			ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
+						   &eht_cap->eht_cap_elem);
+
+		ppe_thres_hdr = get_unaligned_le16(&eht_cap->eht_ppe_thres[0]);
+		ppe_thresh_size =
+			ieee80211_eht_ppe_size(ppe_thres_hdr,
+					       eht_cap->eht_cap_elem.phy_cap_info);
+
+		if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC,
+			    sizeof(eht_cap->eht_cap_elem.mac_cap_info),
+			    eht_cap->eht_cap_elem.mac_cap_info) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY,
+			    sizeof(eht_cap->eht_cap_elem.phy_cap_info),
+			    eht_cap->eht_cap_elem.phy_cap_info) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET,
+			    mcs_nss_size, &eht_cap->eht_mcs_nss_supp) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE,
+			    ppe_thresh_size, eht_cap->eht_ppe_thres))
+			return -ENOBUFS;
+	}
+
 	if (sband->band == NL80211_BAND_6GHZ &&
	    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA,
		    sizeof(iftdata->he_6ghz_capa),
-- cgit v1.2.3
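A rough driver-side sketch of how the new helper is meant to be used
(the band lookup and the driver hook below are placeholders):

	const struct ieee80211_sta_eht_cap *eht;
	struct ieee80211_supported_band *sband =
		wiphy->bands[NL80211_BAND_6GHZ];	/* placeholder band */

	eht = ieee80211_get_eht_iftype_cap(sband, NL80211_IFTYPE_STATION);
	if (eht && (eht->eht_cap_elem.phy_cap_info[0] &
		    IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ))
		drv_enable_320mhz();	/* hypothetical driver hook */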
From 3743bec6120ae0748cb3fda6ff80a690117ef1f3 Mon Sep 17 00:00:00 2001
From: Jia Ding
Date: Mon, 14 Feb 2022 17:29:54 +0100
Subject: cfg80211: Add support for EHT 320 MHz channel width

Add 320 MHz support to the channel definition and center frequency
validation, including the chandef compatibility check.

Signed-off-by: Jia Ding
Co-authored-by: Karthikeyan Periyasamy
Signed-off-by: Karthikeyan Periyasamy
Co-authored-by: Muna Sinada
Signed-off-by: Muna Sinada
Co-authored-by: Veerendranath Jakkam
Signed-off-by: Veerendranath Jakkam
Link: https://lore.kernel.org/r/1640163883-12696-5-git-send-email-quic_vjakkam@quicinc.com
Link: https://lore.kernel.org/r/20220214163009.175289-1-johannes@sipsolutions.net
Signed-off-by: Johannes Berg
---
 drivers/net/wireless/mac80211_hwsim.c | 2 +
 include/net/cfg80211.h | 7 ++-
 include/uapi/linux/nl80211.h | 3 ++
 net/wireless/chan.c | 91 ++++++++++++++++++++++++++++++++---
 net/wireless/nl80211.c | 2 +
 5 files changed, 97 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index defb7e36c1f2..0054c234fb9a 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -836,6 +836,8 @@ static int hwsim_get_chanwidth(enum nl80211_chan_width bw)
 	case NL80211_CHAN_WIDTH_80P80:
 	case NL80211_CHAN_WIDTH_160:
 		return 160;
+	case NL80211_CHAN_WIDTH_320:
+		return 320;
 	case NL80211_CHAN_WIDTH_5:
 		return 5;
 	case NL80211_CHAN_WIDTH_10:
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7bec15f605be..a27f2d699dac 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -109,7 +109,11 @@ struct wiphy;
 *	on this channel.
 * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted
 *	on this channel.
- * + * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, + * this flag indicates that a 320 MHz channel cannot use this + * channel as the control or any of the secondary channels. + * This may be due to the driver or due to regulatory bandwidth + * restrictions. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -131,6 +135,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_4MHZ = 1<<16, IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, + IEEE80211_CHAN_NO_320MHZ = 1<<19, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d305a8b8c536..9e05973f3f56 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4684,6 +4684,8 @@ enum nl80211_key_mode { * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel + * @NL80211_CHAN_WIDTH_320: 320 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -4699,6 +4701,7 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_4, NL80211_CHAN_WIDTH_8, NL80211_CHAN_WIDTH_16, + NL80211_CHAN_WIDTH_320, }; /** diff --git a/net/wireless/chan.c b/net/wireless/chan.c index eb822052d344..8b7fb4a9e07b 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -181,6 +181,9 @@ static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width) case NL80211_CHAN_WIDTH_160: mhz = 160; break; + case NL80211_CHAN_WIDTH_320: + mhz = 320; + break; default: WARN_ON_ONCE(1); return -1; @@ -271,6 +274,17 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_16: /* all checked above */ break; + case NL80211_CHAN_WIDTH_320: + if (chandef->center_freq1 == control_freq + 150 || + chandef->center_freq1 == control_freq + 130 || + chandef->center_freq1 == control_freq + 110 || + chandef->center_freq1 == control_freq + 90 || + chandef->center_freq1 == control_freq - 90 || + chandef->center_freq1 == control_freq - 110 || + chandef->center_freq1 == control_freq - 130 || + chandef->center_freq1 == control_freq - 150) + break; + fallthrough; case NL80211_CHAN_WIDTH_160: if (chandef->center_freq1 == control_freq + 70 || chandef->center_freq1 == control_freq + 50 || @@ -307,7 +321,7 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) EXPORT_SYMBOL(cfg80211_chandef_valid); static void chandef_primary_freqs(const struct cfg80211_chan_def *c, - u32 *pri40, u32 *pri80) + u32 *pri40, u32 *pri80, u32 *pri160) { int tmp; @@ -315,9 +329,11 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, case NL80211_CHAN_WIDTH_40: *pri40 = c->center_freq1; *pri80 = 0; + *pri160 = 0; break; case NL80211_CHAN_WIDTH_80: case NL80211_CHAN_WIDTH_80P80: + *pri160 = 0; *pri80 = c->center_freq1; /* n_P20 */ tmp = (30 + c->chan->center_freq - c->center_freq1)/20; @@ -327,6 +343,7 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, *pri40 = c->center_freq1 - 20 + 40 * tmp; break; case NL80211_CHAN_WIDTH_160: + *pri160 = c->center_freq1; /* n_P20 */ tmp = (70 + c->chan->center_freq - c->center_freq1)/20; /* n_P40 */ @@ -337,6 +354,20 @@ static void chandef_primary_freqs(const struct cfg80211_chan_def *c, tmp /= 2; *pri80 = c->center_freq1 - 40 + 80 * tmp; break; + case NL80211_CHAN_WIDTH_320: + /* n_P20 */ + tmp = (150 + c->chan->center_freq - c->center_freq1) / 20; + /* n_P40 */ + tmp /= 2; + /* freq_P40 */ + *pri40 = 
c->center_freq1 - 140 + 40 * tmp; + /* n_P80 */ + tmp /= 2; + *pri80 = c->center_freq1 - 120 + 80 * tmp; + /* n_P160 */ + tmp /= 2; + *pri160 = c->center_freq1 - 80 + 160 * tmp; + break; default: WARN_ON_ONCE(1); } @@ -346,7 +377,7 @@ const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, const struct cfg80211_chan_def *c2) { - u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80; + u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80, c1_pri160, c2_pri160; /* If they are identical, return */ if (cfg80211_chandef_identical(c1, c2)) @@ -381,14 +412,31 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, c2->width == NL80211_CHAN_WIDTH_20) return c1; - chandef_primary_freqs(c1, &c1_pri40, &c1_pri80); - chandef_primary_freqs(c2, &c2_pri40, &c2_pri80); + chandef_primary_freqs(c1, &c1_pri40, &c1_pri80, &c1_pri160); + chandef_primary_freqs(c2, &c2_pri40, &c2_pri80, &c2_pri160); if (c1_pri40 != c2_pri40) return NULL; - WARN_ON(!c1_pri80 && !c2_pri80); - if (c1_pri80 && c2_pri80 && c1_pri80 != c2_pri80) + if (c1->width == NL80211_CHAN_WIDTH_40) + return c2; + + if (c2->width == NL80211_CHAN_WIDTH_40) + return c1; + + if (c1_pri80 != c2_pri80) + return NULL; + + if (c1->width == NL80211_CHAN_WIDTH_80 && + c2->width > NL80211_CHAN_WIDTH_80) + return c2; + + if (c2->width == NL80211_CHAN_WIDTH_80 && + c1->width > NL80211_CHAN_WIDTH_80) + return c1; + + WARN_ON(!c1_pri160 && !c2_pri160); + if (c1_pri160 && c2_pri160 && c1_pri160 != c2_pri160) return NULL; if (c1->width > c2->width) @@ -960,7 +1008,10 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, struct ieee80211_sta_vht_cap *vht_cap; struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; - bool ext_nss_cap, support_80_80 = false; + bool ext_nss_cap, support_80_80 = false, support_320 = false; + const struct ieee80211_sband_iftype_data *iftd; + struct ieee80211_supported_band *sband; + int i; if (WARN_ON(!cfg80211_chandef_valid(chandef))) return false; @@ -1062,6 +1113,32 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK))) return false; break; + case NL80211_CHAN_WIDTH_320: + prohibited_flags |= IEEE80211_CHAN_NO_320MHZ; + width = 320; + + if (chandef->chan->band != NL80211_BAND_6GHZ) + return false; + + sband = wiphy->bands[NL80211_BAND_6GHZ]; + if (!sband) + return false; + + for (i = 0; i < sband->n_iftype_data; i++) { + iftd = &sband->iftype_data[i]; + if (!iftd->eht_cap.has_eht) + continue; + + if (iftd->eht_cap.eht_cap_elem.phy_cap_info[0] & + IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) { + support_320 = true; + break; + } + } + + if (!support_320) + return false; + break; default: WARN_ON_ONCE(1); return false; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2dfd45f35486..f93b9686fb41 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10589,6 +10589,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_VHT_IBSS)) return -EINVAL; break; + case NL80211_CHAN_WIDTH_320: + return -EINVAL; default: return -EINVAL; } -- cgit v1.2.3 From cfb14110acf87b4db62e07ba08a80429f1749f40 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 14 Feb 2022 17:29:55 +0100 Subject: nl80211: add EHT MCS support Add support for reporting and calculating EHT bitrates. 
Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-7-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 ++++ include/uapi/linux/nl80211.h | 62 ++++++++++++++++++++ net/wireless/nl80211.c | 19 +++++++ net/wireless/util.c | 131 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 223 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a27f2d699dac..f35ffd81d213 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1595,6 +1595,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_HE_MCS: HE MCS information * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS + * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1604,6 +1605,7 @@ enum rate_info_flags { RATE_INFO_FLAGS_HE_MCS = BIT(4), RATE_INFO_FLAGS_EDMG = BIT(5), RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), + RATE_INFO_FLAGS_EHT_MCS = BIT(7), }; /** @@ -1618,6 +1620,8 @@ enum rate_info_flags { * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation + * @RATE_INFO_BW_320: 320 MHz bandwidth + * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1627,6 +1631,8 @@ enum rate_info_bw { RATE_INFO_BW_80, RATE_INFO_BW_160, RATE_INFO_BW_HE_RU, + RATE_INFO_BW_320, + RATE_INFO_BW_EHT_RU, }; /** @@ -1644,6 +1650,9 @@ enum rate_info_bw { * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, * only valid if bw is %RATE_INFO_BW_HE_RU) * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4) + * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi) + * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc, + * only valid if bw is %RATE_INFO_BW_EHT_RU) */ struct rate_info { u8 flags; @@ -1655,6 +1664,8 @@ struct rate_info { u8 he_dcm; u8 he_ru_alloc; u8 n_bonded_ch; + u8 eht_gi; + u8 eht_ru_alloc; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9e05973f3f56..d0ba70ea5d04 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3392,6 +3392,56 @@ enum nl80211_he_ru_alloc { NL80211_RATE_INFO_HE_RU_ALLOC_2x996, }; +/** + * enum nl80211_eht_gi - EHT guard interval + * @NL80211_RATE_INFO_EHT_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_EHT_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_EHT_GI_3_2: 3.2 usec + */ +enum nl80211_eht_gi { + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_1_6, + NL80211_RATE_INFO_EHT_GI_3_2, +}; + +/** + * enum nl80211_eht_ru_alloc - EHT RU allocation values + * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52P26: 52+26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106P26: 106+26 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484: 484-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484P242: 484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484: 996+484 tone RU allocation + * 
@NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242: 996+484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996: 2x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484: 2x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996: 3x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484: 3x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_4x996: 4x996-tone RU allocation + */ +enum nl80211_eht_ru_alloc { + NL80211_RATE_INFO_EHT_RU_ALLOC_26, + NL80211_RATE_INFO_EHT_RU_ALLOC_52, + NL80211_RATE_INFO_EHT_RU_ALLOC_52P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_106, + NL80211_RATE_INFO_EHT_RU_ALLOC_106P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_242, + NL80211_RATE_INFO_EHT_RU_ALLOC_484, + NL80211_RATE_INFO_EHT_RU_ALLOC_484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_996, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_4x996, +}; + /** * enum nl80211_rate_info - bitrate information * @@ -3431,6 +3481,13 @@ enum nl80211_he_ru_alloc { * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1) * @NL80211_RATE_INFO_RU_ALLOC: HE RU allocation, if not present then * non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc) + * @NL80211_RATE_INFO_320_MHZ_WIDTH: 320 MHz bitrate + * @NL80211_RATE_INFO_EHT_MCS: EHT MCS index (u8, 0-15) + * @NL80211_RATE_INFO_EHT_NSS: EHT NSS value (u8, 1-8) + * @NL80211_RATE_INFO_EHT_GI: EHT guard interval identifier + * (u8, see &enum nl80211_eht_gi) + * @NL80211_RATE_INFO_EHT_RU_ALLOC: EHT RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_eht_ru_alloc) * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -3452,6 +3509,11 @@ enum nl80211_rate_info { NL80211_RATE_INFO_HE_GI, NL80211_RATE_INFO_HE_DCM, NL80211_RATE_INFO_HE_RU_ALLOC, + NL80211_RATE_INFO_320_MHZ_WIDTH, + NL80211_RATE_INFO_EHT_MCS, + NL80211_RATE_INFO_EHT_NSS, + NL80211_RATE_INFO_EHT_GI, + NL80211_RATE_INFO_EHT_RU_ALLOC, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f93b9686fb41..a267e838f79c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5954,6 +5954,14 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) case RATE_INFO_BW_HE_RU: rate_flg = 0; WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS)); + break; + case RATE_INFO_BW_320: + rate_flg = NL80211_RATE_INFO_320_MHZ_WIDTH; + break; + case RATE_INFO_BW_EHT_RU: + rate_flg = 0; + WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS)); + break; } if (rate_flg && nla_put_flag(msg, rate_flg)) @@ -5986,6 +5994,17 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, info->he_ru_alloc)) return false; + } else if (info->flags & RATE_INFO_FLAGS_EHT_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi)) + return false; + if (info->bw == RATE_INFO_BW_EHT_RU && + nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, + info->eht_ru_alloc)) + return false; } nla_nest_end(msg, rate); diff --git a/net/wireless/util.c b/net/wireless/util.c index e02f1702806e..2eda097aee7f 100644 --- a/net/wireless/util.c +++ 
b/net/wireless/util.c
@@ -1430,6 +1430,135 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
 	return result / 10000;
 }

+static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+{
+#define SCALE 6144
+	static const u32 mcs_divisors[16] = {
+		102399, /* 16.666666... */
+		51201, /* 8.333333... */
+		34134, /* 5.555555... */
+		25599, /* 4.166666... */
+		17067, /* 2.777777... */
+		12801, /* 2.083333... */
+		11377, /* 1.851725... */
+		10239, /* 1.666666... */
+		8532, /* 1.388888... */
+		7680, /* 1.250000... */
+		6828, /* 1.111111... */
+		6144, /* 1.000000... */
+		5690, /* 0.926106... */
+		5120, /* 0.833333... */
+		409600, /* 66.666666... */
+		204800, /* 33.333333... */
+	};
+	static const u32 rates_996[3] =  { 480388888, 453700000, 408333333 };
+	static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
+	static const u32 rates_242[3] =  { 114711111, 108333333,  97500000 };
+	static const u32 rates_106[3] =  {  40000000,  37777777,  34000000 };
+	static const u32 rates_52[3]  =  {  18820000,  17777777,  16000000 };
+	static const u32 rates_26[3]  =  {   9411111,   8888888,   8000000 };
+	u64 tmp;
+	u32 result;
+
+	if (WARN_ON_ONCE(rate->mcs > 15))
+		return 0;
+	if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2))
+		return 0;
+	if (WARN_ON_ONCE(rate->eht_ru_alloc >
+			 NL80211_RATE_INFO_EHT_RU_ALLOC_4x996))
+		return 0;
+	if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
+		return 0;
+
+	/* Bandwidth checks for MCS 14 */
+	if (rate->mcs == 14) {
+		if ((rate->bw != RATE_INFO_BW_EHT_RU &&
+		     rate->bw != RATE_INFO_BW_80 &&
+		     rate->bw != RATE_INFO_BW_160 &&
+		     rate->bw != RATE_INFO_BW_320) ||
+		    (rate->bw == RATE_INFO_BW_EHT_RU &&
+		     rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 &&
+		     rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 &&
+		     rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) {
+			WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n",
+			     rate->bw, rate->eht_ru_alloc);
+			return 0;
+		}
+	}
+
+	if (rate->bw == RATE_INFO_BW_320 ||
+	    (rate->bw == RATE_INFO_BW_EHT_RU &&
+	     rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996))
+		result = 4 * rates_996[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484)
+		result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996)
+		result = 3 * rates_996[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484)
+		result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_160 ||
+		 (rate->bw == RATE_INFO_BW_EHT_RU &&
+		  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996))
+		result = 2 * rates_996[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc ==
+		 NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242)
+		result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]
+			 + rates_242[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484)
+		result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_80 ||
+		 (rate->bw == RATE_INFO_BW_EHT_RU &&
+		  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996))
+		result = rates_996[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242)
+		result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_40 ||
+		 (rate->bw == RATE_INFO_BW_EHT_RU &&
+		  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484))
+		result = rates_484[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_20 ||
+		 (rate->bw == RATE_INFO_BW_EHT_RU &&
+		  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242))
+		result = rates_242[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26)
+		result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106)
+		result = rates_106[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26)
+		result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52)
+		result = rates_52[rate->eht_gi];
+	else if (rate->bw == RATE_INFO_BW_EHT_RU &&
+		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26)
+		result = rates_26[rate->eht_gi];
+	else {
+		WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n",
+		     rate->bw, rate->eht_ru_alloc);
+		return 0;
+	}
+
+	/* now scale to the appropriate MCS */
+	tmp = result;
+	tmp *= SCALE;
+	do_div(tmp, mcs_divisors[rate->mcs]);
+	result = tmp;
+
+	/* and take NSS */
+	result = (result * rate->nss) / 8;
+
+	return result / 10000;
+}
+
 u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 {
 	if (rate->flags & RATE_INFO_FLAGS_MCS)
@@ -1444,6 +1573,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 		return cfg80211_calculate_bitrate_vht(rate);
 	if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
 		return cfg80211_calculate_bitrate_he(rate);
+	if (rate->flags & RATE_INFO_FLAGS_EHT_MCS)
+		return cfg80211_calculate_bitrate_eht(rate);

 	return rate->legacy;
 }
-- cgit v1.2.3
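As a quick sanity check of the table and scaling above: for a 320 MHz
EHT rate with MCS 13, NSS 8 and a 0.8 usec guard interval, result starts
as 4 * rates_996[0] = 4 * 480388888 = 1921555552; scaling by
SCALE / mcs_divisors[13] = 6144 / 5120 gives 2305866662, the NSS step
(* 8 / 8) leaves it unchanged, and dividing by 10000 yields 230586 units
of 100 kbit/s, i.e. about 23.06 Gbit/s, the expected EHT maximum PHY
rate for that configuration.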
rates_242[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_40 || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) + result = rates_484[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_20 || + (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) + result = rates_242[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) + result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) + result = rates_106[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) + result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) + result = rates_52[rate->eht_gi]; + else if (rate->bw == RATE_INFO_BW_EHT_RU && + rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) + result = rates_26[rate->eht_gi]; + else { + WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", + rate->bw, rate->eht_ru_alloc); + return 0; + } + + /* now scale to the appropriate MCS */ + tmp = result; + tmp *= SCALE; + do_div(tmp, mcs_divisors[rate->mcs]); + result = tmp; + + /* and take NSS */ + result = (result * rate->nss) / 8; + + return result / 10000; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1444,6 +1573,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); + if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) + return cfg80211_calculate_bitrate_eht(rate); return rate->legacy; } -- cgit v1.2.3 From c2b3d7699fb0ce66538b829af43970acc2f89060 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Mon, 14 Feb 2022 17:29:56 +0100 Subject: nl80211: add support for 320MHz channel limitation Add support to advertise drivers or regulatory limitations on 320 MHz channels to userspace. Signed-off-by: Sriram R Co-authored-by: Karthikeyan Periyasamy Signed-off-by: Karthikeyan Periyasamy Co-authored-by: Veerendranath Jakkam Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-6-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-3-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 3 +++ net/wireless/reg.c | 6 ++++++ 3 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d0ba70ea5d04..6a338dafcd07 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3997,6 +3997,8 @@ enum nl80211_wmm_rule { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed * on this channel in current regulatory domain. 
+ * @NL80211_FREQUENCY_ATTR_NO_320MHZ: any 320 MHz channel using this channel + * as the primary or any of the secondary channels isn't possible * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4033,6 +4035,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_4MHZ, NL80211_FREQUENCY_ATTR_8MHZ, NL80211_FREQUENCY_ATTR_16MHZ, + NL80211_FREQUENCY_ATTR_NO_320MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4231,6 +4234,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed * @NL80211_RRF_NO_HE: HE operation not allowed + * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4249,6 +4253,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_80MHZ = 1<<15, NL80211_RRF_NO_160MHZ = 1<<16, NL80211_RRF_NO_HE = 1<<17, + NL80211_RRF_NO_320MHZ = 1<<18, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a267e838f79c..7edf0bb4e90e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1156,6 +1156,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_16MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index ec25924a1c26..c76cd973f06e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1238,6 +1238,8 @@ unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, { unsigned int bw = reg_get_max_bandwidth_from_range(rd, rule); + if (rule->flags & NL80211_RRF_NO_320MHZ) + bw = min_t(unsigned int, bw, MHZ_TO_KHZ(160)); if (rule->flags & NL80211_RRF_NO_160MHZ) bw = min_t(unsigned int, bw, MHZ_TO_KHZ(80)); if (rule->flags & NL80211_RRF_NO_80MHZ) @@ -1611,6 +1613,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_160MHZ; if (rd_flags & NL80211_RRF_NO_HE) channel_flags |= IEEE80211_CHAN_NO_HE; + if (rd_flags & NL80211_RRF_NO_320MHZ) + channel_flags |= IEEE80211_CHAN_NO_320MHZ; return channel_flags; } @@ -1773,6 +1777,8 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; } return bw_flags; } -- cgit v1.2.3 From 31846b657857e6a73d982604f36a34710d98902c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:57 +0100 Subject: cfg80211: add NO-EHT flag to regulatory This may be necessary in some cases, add a flag and propagate it, just like the NO-HE that already exists. 
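For illustration only (not part of this patch), here is roughly how a userspace consumer could test the new attribute once it has parsed one frequency entry from the wiphy dump into a libnl attribute table; the helper name is made up and the parsing boilerplate is assumed:

    /* tb[] parsed from one NL80211_BAND_ATTR_FREQS entry; assumes
     * <netlink/attr.h> and <linux/nl80211.h> */
    static bool freq_allows_eht(struct nlattr *tb[])
    {
            /* NL80211_FREQUENCY_ATTR_NO_EHT is a flag attribute:
             * its presence means EHT is not permitted here */
            return !tb[NL80211_FREQUENCY_ATTR_NO_EHT];
    }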
Signed-off-by: Ilan Peer [split off from a combined 320/no-EHT patch] Link: https://lore.kernel.org/r/20220214173004.dbb85a7b86bb.Ifc1e2daac51c1cc5f895ccfb79faf5eaec3950ec@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f35ffd81d213..5cfc483dece1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -114,6 +114,7 @@ struct wiphy; * channel as the control or any of the secondary channels. * This may be due to the driver or due to regulatory bandwidth * restrictions. + * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -136,6 +137,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, IEEE80211_CHAN_NO_320MHZ = 1<<19, + IEEE80211_CHAN_NO_EHT = 1<<20, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6a338dafcd07..baf6433c0119 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3999,6 +3999,8 @@ enum nl80211_wmm_rule { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_NO_320MHZ: any 320 MHz channel using this channel * as the primary or any of the secondary channels isn't possible + * @NL80211_FREQUENCY_ATTR_NO_EHT: EHT operation is not allowed on this channel + * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4036,6 +4038,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_8MHZ, NL80211_FREQUENCY_ATTR_16MHZ, NL80211_FREQUENCY_ATTR_NO_320MHZ, + NL80211_FREQUENCY_ATTR_NO_EHT, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7edf0bb4e90e..3e6809f24220 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1159,6 +1159,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_EHT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_EHT)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, -- cgit v1.2.3 From ea05fd3581d32a0f1098657005c7a9b763798fe8 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:58 +0100 Subject: cfg80211: Support configuration of station EHT capabilities Add attributes and some code bits to support userspace passing in EHT capabilities of stations. 
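A rough sketch of the corresponding userspace side, assuming a libnl message already being built for NL80211_CMD_NEW_STATION (the buffer would carry the EHT Capabilities element body taken from the station's association request; per the attribute documentation it is only accepted together with NL80211_STA_FLAG_WME):

    /* sketch, not part of this patch; eht_capa/eht_capa_len come from
     * the assoc request, msg is a struct nl_msg being prepared */
    if (nla_put(msg, NL80211_ATTR_EHT_CAPABILITY, eht_capa_len, eht_capa))
            goto nla_put_failure;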
Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.ecf0b3ff9627.Icb4a5f2ec7b41d9008ac4cfc16c59baeb84793d3@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 10 +++++++++- net/wireless/nl80211.c | 37 +++++++++++++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5cfc483dece1..68713388b617 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1487,6 +1487,8 @@ struct sta_txpwr { * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eht_capa: EHT capabilities of station + * @eht_capa_len: the length of the EHT capabilities */ struct station_parameters { const u8 *supported_rates; @@ -1520,6 +1522,8 @@ struct station_parameters { u16 airtime_weight; struct sta_txpwr txpwr; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_eht_cap_elem *eht_capa; + u8 eht_capa_len; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index baf6433c0119..98ed52663d6b 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2659,6 +2659,10 @@ enum nl80211_commands { * enumerated in &enum nl80211_ap_settings_flags. This attribute shall be * used with %NL80211_CMD_START_AP request. * + * @NL80211_ATTR_EHT_CAPABILITY: EHT Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3169,6 +3173,8 @@ enum nl80211_attrs { NL80211_ATTR_AP_SETTINGS_FLAGS, + NL80211_ATTR_EHT_CAPABILITY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3224,6 +3230,8 @@ enum nl80211_attrs { #define NL80211_HE_MAX_CAPABILITY_LEN 54 #define NL80211_MAX_NR_CIPHER_SUITES 5 #define NL80211_MAX_NR_AKM_SUITES 2 +#define NL80211_EHT_MIN_CAPABILITY_LEN 13 +#define NL80211_EHT_MAX_CAPABILITY_LEN 51 #define NL80211_MIN_REMAIN_ON_CHANNEL_TIME 10 diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3e6809f24220..7543c73a3f1d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include @@ -786,6 +786,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MBSSID_ELEMS] = { .type = NLA_NESTED }, [NL80211_ATTR_RADAR_BACKGROUND] = { .type = NLA_FLAG }, [NL80211_ATTR_AP_SETTINGS_FLAGS] = { .type = NLA_U32 }, + [NL80211_ATTR_EHT_CAPABILITY] = + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_EHT_MIN_CAPABILITY_LEN, + NL80211_EHT_MAX_CAPABILITY_LEN), }; /* policy for the key attributes */ @@ -6425,7 +6429,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, if (params->supported_rates) return -EINVAL; if (params->ext_capab || params->ht_capa || params->vht_capa || - params->he_capa) + params->he_capa || params->eht_capa) return -EINVAL; } @@ -6628,6 +6632,18 @@ static int nl80211_set_station_tdls(struct genl_info *info, nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); params->he_capa_len = nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) { + params->eht_capa = + nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + params->eht_capa_len = + nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + + if (!ieee80211_eht_capa_size_ok((const u8 *)params->he_capa, + (const u8 *)params->eht_capa, + params->eht_capa_len)) + return -EINVAL; + } } err = nl80211_parse_sta_channel_info(info, params); @@ -6885,6 +6901,18 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); params.he_capa_len = nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) { + params.eht_capa = + nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + params.eht_capa_len = + nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + + if (!ieee80211_eht_capa_size_ok((const u8 *)params.he_capa, + (const u8 *)params.eht_capa, + params.eht_capa_len)) + return -EINVAL; + } } if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) @@ -6934,8 +6962,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.ht_capa = NULL; params.vht_capa = NULL; - /* HE requires WME */ - if (params.he_capa_len || params.he_6ghz_capa) + /* HE and EHT require WME */ + if (params.he_capa_len || params.he_6ghz_capa || + params.eht_capa_len) return -EINVAL; } -- cgit v1.2.3 From 79aa0367385ceaf5351ea77ea1fb66136739ea9d Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 15 Feb 2022 18:54:38 -0500 Subject: drm/amdkfd: Replace zero-length array with flexible-array member 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference: https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays CC: Changcheng Deng Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 6e4268f5e482..baec5a41de3e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -673,7 +673,7 @@ struct kfd_ioctl_svm_args { __u32 op; __u32 nattr; /* Variable length array of attributes */ - struct kfd_ioctl_svm_attribute attrs[0]; + struct kfd_ioctl_svm_attribute attrs[]; }; /** -- cgit v1.2.3 From 169adc2b6b3c5e86391921117b4ab3aaeb3c6ee1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 10 Feb 2022 11:11:26 +0900 Subject: android/binder.h: add linux/android/binder(fs).h to UAPI compile-test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/android/binder.h and linux/android/binderfs.h are currently excluded from the UAPI compile-test because of errors like the following: HDRTEST usr/include/linux/android/binder.h In file included from : ./usr/include/linux/android/binder.h:291:9: error: unknown type name ‘pid_t’ 291 | pid_t sender_pid; | ^~~~~ ./usr/include/linux/android/binder.h:292:9: error: unknown type name ‘uid_t’ 292 | uid_t sender_euid; | ^~~~~ The errors can be fixed by replacing {pid,uid}_t with __kernel_{pid,uid}_t. Then, remove the no-header-test entries from usr/include/Makefile. Signed-off-by: Masahiro Yamada Reviewed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann --- include/uapi/linux/android/binder.h | 4 ++-- usr/include/Makefile | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 3246f2c74696..11157fae8a8e 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -288,8 +288,8 @@ struct binder_transaction_data { /* General information about the transaction.
*/ __u32 flags; - pid_t sender_pid; - uid_t sender_euid; + __kernel_pid_t sender_pid; + __kernel_uid_t sender_euid; binder_size_t data_size; /* number of bytes of data */ binder_size_t offsets_size; /* number of bytes of offsets */ diff --git a/usr/include/Makefile b/usr/include/Makefile index 1aa725a3cbbc..0872877c9457 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -23,8 +23,6 @@ override c_flags = $(UAPI_CFLAGS) -Wp,-MMD,$(depfile) -I$(objtree)/usr/include no-header-test += asm/ucontext.h no-header-test += drm/vmwgfx_drm.h no-header-test += linux/am437x-vpfe.h -no-header-test += linux/android/binder.h -no-header-test += linux/android/binderfs.h no-header-test += linux/coda.h no-header-test += linux/cyclades.h no-header-test += linux/errqueue.h -- cgit v1.2.3 From cbf2820341297b9aed0f846aba35556e94569210 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 10 Feb 2022 11:11:27 +0900 Subject: fsmap.h: add linux/fsmap.h to UAPI compile-test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/fsmap.h is currently excluded from the UAPI compile-test because of an error like the following: HDRTEST usr/include/linux/fsmap.h In file included from : ./usr/include/linux/fsmap.h:72:19: error: unknown type name ‘size_t’ 72 | static __inline__ size_t | ^~~~~~ The error can be fixed by replacing size_t with __kernel_size_t. Then, remove the no-header-test entry from usr/include/Makefile. Signed-off-by: Masahiro Yamada Reviewed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann --- include/uapi/linux/fsmap.h | 2 +- usr/include/Makefile | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h index 91fd519a3f7d..c690d17f1d07 100644 --- a/include/uapi/linux/fsmap.h +++ b/include/uapi/linux/fsmap.h @@ -69,7 +69,7 @@ struct fsmap_head { }; /* Size of an fsmap_head with room for nr records. */ -static inline size_t +static inline __kernel_size_t fsmap_sizeof( unsigned int nr) { diff --git a/usr/include/Makefile b/usr/include/Makefile index 0872877c9457..3cd15af3266d 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -26,7 +26,6 @@ no-header-test += linux/am437x-vpfe.h no-header-test += linux/coda.h no-header-test += linux/cyclades.h no-header-test += linux/errqueue.h -no-header-test += linux/fsmap.h no-header-test += linux/hdlc/ioctl.h no-header-test += linux/ivtv.h no-header-test += linux/kexec.h -- cgit v1.2.3 From 8b4bca21c2c0cb3b5adb80985830a81d4e5d7081 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 10 Feb 2022 11:11:28 +0900 Subject: kexec.h: add linux/kexec.h to UAPI compile-test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/kexec.h is currently excluded from the UAPI compile-test because of errors like the following: HDRTEST usr/include/linux/kexec.h In file included from : ./usr/include/linux/kexec.h:56:9: error: unknown type name ‘size_t’ 56 | size_t bufsz; | ^~~~~~ ./usr/include/linux/kexec.h:58:9: error: unknown type name ‘size_t’ 58 | size_t memsz; | ^~~~~~ The errors can be fixed by replacing size_t with __kernel_size_t. Then, remove the no-header-test entry from usr/include/Makefile.
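For context, the usr/include compile-test builds each exported header as a standalone translation unit with no libc headers in scope; conceptually, each per-header test amounts to a sketch like the one below (the real flags live in usr/include/Makefile), which is why the header must provide every type it uses itself, with __kernel_size_t coming in via <linux/types.h>:

    /* hdrtest sketch: must compile with only exported uapi headers visible */
    #include <linux/kexec.h>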
Signed-off-by: Masahiro Yamada Reviewed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann --- include/uapi/linux/kexec.h | 4 ++-- usr/include/Makefile | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 778dc191c265..fb7e2ef60825 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -54,9 +54,9 @@ */ struct kexec_segment { const void *buf; - size_t bufsz; + __kernel_size_t bufsz; const void *mem; - size_t memsz; + __kernel_size_t memsz; }; #endif /* __KERNEL__ */ diff --git a/usr/include/Makefile b/usr/include/Makefile index 3cd15af3266d..0f59020ae5ef 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -28,7 +28,6 @@ no-header-test += linux/cyclades.h no-header-test += linux/errqueue.h no-header-test += linux/hdlc/ioctl.h no-header-test += linux/ivtv.h -no-header-test += linux/kexec.h no-header-test += linux/matroxfb.h no-header-test += linux/omap3isp.h no-header-test += linux/omapfb.h -- cgit v1.2.3 From 2a5c0fdc70cd653741e910e92ffeb2fa7376db07 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 10 Feb 2022 11:11:29 +0900 Subject: reiserfs_xattr.h: add linux/reiserfs_xattr.h to UAPI compile-test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/reiserfs_xattr.h is currently excluded from the UAPI compile-test because of an error like the following: HDRTEST usr/include/linux/reiserfs_xattr.h In file included from : ./usr/include/linux/reiserfs_xattr.h:22:9: error: unknown type name ‘size_t’ 22 | size_t length; | ^~~~~~ The error can be fixed by replacing size_t with __kernel_size_t. Then, remove the no-header-test entry from usr/include/Makefile. Signed-off-by: Masahiro Yamada Reviewed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann --- include/uapi/linux/reiserfs_xattr.h | 2 +- usr/include/Makefile | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/reiserfs_xattr.h b/include/uapi/linux/reiserfs_xattr.h index 28f10842f047..503ad018ce5b 100644 --- a/include/uapi/linux/reiserfs_xattr.h +++ b/include/uapi/linux/reiserfs_xattr.h @@ -19,7 +19,7 @@ struct reiserfs_xattr_header { struct reiserfs_security_handle { const char *name; void *value; - size_t length; + __kernel_size_t length; }; #endif /* _LINUX_REISERFS_XATTR_H */ diff --git a/usr/include/Makefile b/usr/include/Makefile index 0f59020ae5ef..7b283d43f00d 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -33,7 +33,6 @@ no-header-test += linux/omap3isp.h no-header-test += linux/omapfb.h no-header-test += linux/patchkey.h no-header-test += linux/phonet.h -no-header-test += linux/reiserfs_xattr.h no-header-test += linux/sctp.h no-header-test += linux/sysctl.h no-header-test += linux/usb/audio.h -- cgit v1.2.3 From d4568fc8525897e683983806f813be1ae9eedaed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jan 2022 18:29:52 +0100 Subject: media: omap3isp: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Wrap the target region in struct_group(). This additionally fixes a theoretical misalignment of the copy (since the size of "buf" changes between 64-bit and 32-bit, but this is likely never built for 64-bit).
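For reference, __struct_group() (from include/uapi/linux/stddef.h) expands to an anonymous union of an anonymous struct and an identically laid-out named struct, so the members keep their old names while "frame" names the grouped region as a single object; roughly:

    union {
            struct {
                    __u32 buf_size;
                    __u16 frame_number;
                    __u16 cur_frame;
                    __u16 config_counter;
            };
            struct {
                    __u32 buf_size;
                    __u16 frame_number;
                    __u16 cur_frame;
                    __u16 config_counter;
            } frame;
    };

This is what lets the fixed code below copy exactly those four fields with memcpy(&data->frame, &data64.frame, sizeof(data->frame)) without tripping FORTIFY_SOURCE's per-field bounds checks.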
FWIW, I think this code is totally broken on 64-bit (which appears to not be a "real" build configuration): it would either always fail (with an uninitialized data->buf_size) or would cause corruption in userspace due to the copy_to_user() in the call path against an uninitialized data->buf value: omap3isp_stat_request_statistics_time32(...) struct omap3isp_stat_data data64; ... omap3isp_stat_request_statistics(stat, &data64); int omap3isp_stat_request_statistics(struct ispstat *stat, struct omap3isp_stat_data *data) ... buf = isp_stat_buf_get(stat, data); static struct ispstat_buffer *isp_stat_buf_get(struct ispstat *stat, struct omap3isp_stat_data *data) ... if (buf->buf_size > data->buf_size) { ... return ERR_PTR(-EINVAL); } ... rval = copy_to_user(data->buf, buf->virt_addr, buf->buf_size); Regardless, additionally initialize data64 to be zero-filled to avoid undefined behavior. Link: https://lore.kernel.org/lkml/20211215220505.GB21862@embeddedor Cc: Arnd Bergmann Fixes: 378e3f81cb56 ("media: omap3isp: support 64-bit version of omap3isp_stat_data") Cc: stable@vger.kernel.org Reviewed-by: Gustavo A. R. Silva Signed-off-by: Kees Cook Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/omap3isp/ispstat.c | 5 +++-- include/uapi/linux/omap3isp.h | 21 +++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/omap3isp/ispstat.c b/drivers/media/platform/omap3isp/ispstat.c index 5b9b57f4d9bf..68cf68dbcace 100644 --- a/drivers/media/platform/omap3isp/ispstat.c +++ b/drivers/media/platform/omap3isp/ispstat.c @@ -512,7 +512,7 @@ int omap3isp_stat_request_statistics(struct ispstat *stat, int omap3isp_stat_request_statistics_time32(struct ispstat *stat, struct omap3isp_stat_data_time32 *data) { - struct omap3isp_stat_data data64; + struct omap3isp_stat_data data64 = { }; int ret; ret = omap3isp_stat_request_statistics(stat, &data64); @@ -521,7 +521,8 @@ int omap3isp_stat_request_statistics_time32(struct ispstat *stat, data->ts.tv_sec = data64.ts.tv_sec; data->ts.tv_usec = data64.ts.tv_usec; - memcpy(&data->buf, &data64.buf, sizeof(*data) - sizeof(data->ts)); + data->buf = (uintptr_t)data64.buf; + memcpy(&data->frame, &data64.frame, sizeof(data->frame)); return 0; } diff --git a/include/uapi/linux/omap3isp.h b/include/uapi/linux/omap3isp.h index 87b55755f4ff..d9db7ad43890 100644 --- a/include/uapi/linux/omap3isp.h +++ b/include/uapi/linux/omap3isp.h @@ -162,6 +162,7 @@ struct omap3isp_h3a_aewb_config { * struct omap3isp_stat_data - Statistic data sent to or received from user * @ts: Timestamp of returned framestats. * @buf: Pointer to pass to user. + * @buf_size: Size of buffer. * @frame_number: Frame number of requested stats. * @cur_frame: Current frame number being processed. * @config_counter: Number of the configuration associated with the data. 
@@ -176,10 +177,12 @@ struct omap3isp_stat_data { struct timeval ts; #endif void __user *buf; - __u32 buf_size; - __u16 frame_number; - __u16 cur_frame; - __u16 config_counter; + __struct_group(/* no tag */, frame, /* no attrs */, + __u32 buf_size; + __u16 frame_number; + __u16 cur_frame; + __u16 config_counter; + ); }; #ifdef __KERNEL__ @@ -189,10 +192,12 @@ struct omap3isp_stat_data_time32 { __s32 tv_usec; } ts; __u32 buf; - __u32 buf_size; - __u16 frame_number; - __u16 cur_frame; - __u16 config_counter; + __struct_group(/* no tag */, frame, /* no attrs */, + __u32 buf_size; + __u16 frame_number; + __u16 cur_frame; + __u16 config_counter; + ); }; #endif -- cgit v1.2.3 From 6de74d1069b821e96460d0fc2edfc35785db04fb Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 9 Feb 2022 08:11:10 -0800 Subject: hv_utils: Add comment about max VMbus packet size in VSS driver The VSS driver allocates a VMbus receive buffer significantly larger than sizeof(hv_vss_msg), with no explanation. To help prevent future mistakes, add a #define and comment about why this is done. No functional change. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1644423070-75125-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/hv_snapshot.c | 7 +++++-- include/uapi/linux/hyperv.h | 11 +++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 6018b9d1b1fb..0d2184be1691 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -31,6 +31,9 @@ static const int fw_versions[] = { UTIL_FW_VERSION }; +/* See comment with struct hv_vss_msg regarding the max VMbus packet size */ +#define VSS_MAX_PKT_SIZE (HV_HYP_PAGE_SIZE * 2) + /* * Timeout values are based on expecations from host */ @@ -298,7 +301,7 @@ void hv_vss_onchannelcallback(void *context) if (vss_transaction.state > HVUTIL_READY) return; - if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { + if (vmbus_recvpacket(channel, recv_buffer, VSS_MAX_PKT_SIZE, &recvlen, &requestid)) { pr_err_ratelimited("VSS request received. Could not read into recv buf\n"); return; } @@ -375,7 +378,7 @@ hv_vss_init(struct hv_util_service *srv) } recv_buffer = srv->recv_buffer; vss_transaction.recv_channel = srv->channel; - vss_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; + vss_transaction.recv_channel->max_pkt_size = VSS_MAX_PKT_SIZE; /* * When this driver loads, the user level daemon that diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index daf82a230c0e..aaa502a7bff4 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -90,6 +90,17 @@ struct hv_vss_check_dm_info { __u32 flags; } __attribute__((packed)); +/* + * struct hv_vss_msg encodes the fields that the Linux VSS + * driver accesses. However, FREEZE messages from Hyper-V contain + * additional LUN information that Linux doesn't use and are not + * represented in struct hv_vss_msg. A received FREEZE message may + * be as large as 6,260 bytes, so the driver must allocate at least + * that much space, not sizeof(struct hv_vss_msg). Other messages + * such as AUTO_RECOVER may be as large as 12,500 bytes. However, + * because the Linux VSS driver responds that it doesn't support + * auto-recovery, it should not receive such messages. 
+ */ struct hv_vss_msg { union { struct hv_vss_hdr vss_hdr; -- cgit v1.2.3 From 47f0bd5032106469827cf56c8b45bb9101112105 Mon Sep 17 00:00:00 2001 From: Jacques de Laval Date: Thu, 17 Feb 2022 16:02:02 +0100 Subject: net: Add new protocol attribute to IP addresses This patch adds a new protocol attribute to IPv4 and IPv6 addresses. Inspiration was taken from the protocol attribute of routes. User space applications like iproute2 can set/get the protocol with the Netlink API. The attribute is stored as an 8-bit unsigned integer. The protocol attribute is set by kernel for these categories: - IPv4 and IPv6 loopback addresses - IPv6 addresses generated from router announcements - IPv6 link local addresses User space may pass custom protocols, not defined by the kernel. Grouping addresses on their origin is useful in scenarios where you want to distinguish between addresses based on who added them, e.g. kernel vs. user space. Tagging addresses with a string label is an existing feature that could be used as a solution. Unfortunately the max length of a label is 15 characters, and for compatibility reasons the label must be prefixed with the name of the device followed by a colon. Since device names also have a max length of 15 characters, only -1 characters is guaranteed to be available for any origin tag, which is not that much. A reference implementation of user space setting and getting protocols is available for iproute2: https://github.com/westermo/iproute2/commit/9a6ea18bd79f47f293e5edc7780f315ea42ff540 Signed-off-by: Jacques de Laval Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220217150202.80802-1-Jacques.De.Laval@westermo.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 2 ++ include/net/if_inet6.h | 2 ++ include/uapi/linux/if_addr.h | 9 ++++++++- net/ipv4/devinet.c | 7 +++++++ net/ipv6/addrconf.c | 27 +++++++++++++++++++++------ 6 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 674aeead6260..ead323243e7b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -150,6 +150,7 @@ struct in_ifaddr { __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; + unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 59940e230b78..f7506f08e505 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -64,6 +64,8 @@ struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; + u8 ifa_proto; + const struct in6_addr *peer_pfx; u32 rt_priority; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index f026cf08a8e8..4cfdef6ca4f6 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -71,6 +71,8 @@ struct inet6_ifaddr { bool tokenized; + u8 ifa_proto; + struct rcu_head rcu; struct in6_addr peer_addr; }; diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index dfcf3ce0097f..1c392dd95a5e 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -33,8 +33,9 @@ enum { IFA_CACHEINFO, IFA_MULTICAST, IFA_FLAGS, - IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ IFA_TARGET_NETNSID, + IFA_PROTO, /* u8, address protocol */ __IFA_MAX, }; @@ -69,4 +70,10 @@ struct ifa_cacheinfo { #define IFA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifaddrmsg)) #endif +/* ifa_proto */ 
+#define IFAPROT_UNSPEC 0 +#define IFAPROT_KERNEL_LO 1 /* loopback */ +#define IFAPROT_KERNEL_RA 2 /* set by kernel from router announcement */ +#define IFAPROT_KERNEL_LL 3 /* link-local set by kernel */ + #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fba2bffd65f7..53a6b14dc50a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -104,6 +104,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFA_PROTO] = { .type = NLA_U8 }, }; struct inet_fill_args { @@ -889,6 +890,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_PROTO]) + ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); + if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -1625,6 +1629,7 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } @@ -1699,6 +1704,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + (ifa->ifa_proto && + nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2f307da17f21..85bab3a35709 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1115,6 +1115,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, ifa->prefix_len = cfg->plen; ifa->rt_priority = cfg->rt_priority; ifa->flags = cfg->ifa_flags; + ifa->ifa_proto = cfg->ifa_proto; /* No need to add the TENTATIVE flag for addresses with NODAD */ if (!(cfg->ifa_flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -2593,6 +2594,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, .valid_lft = valid_lft, .preferred_lft = prefered_lft, .scope = addr_type & IPV6_ADDR_SCOPE_MASK, + .ifa_proto = IFAPROT_KERNEL_RA }; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD @@ -3077,7 +3079,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) } static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, - int plen, int scope) + int plen, int scope, u8 proto) { struct inet6_ifaddr *ifp; struct ifa6_config cfg = { @@ -3086,7 +3088,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, .ifa_flags = IFA_F_PERMANENT, .valid_lft = INFINITY_LIFE_TIME, .preferred_lft = INFINITY_LIFE_TIME, - .scope = scope + .scope = scope, + .ifa_proto = proto }; ifp = ipv6_add_addr(idev, &cfg, true, NULL); @@ -3131,7 +3134,7 @@ static void add_v4_addrs(struct inet6_dev *idev) } if (addr.s6_addr32[3]) { - add_addr(idev, &addr, plen, scope); + add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC); addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_KERNEL); return; @@ -3154,7 +3157,8 @@ static void add_v4_addrs(struct inet6_dev *idev) flag |= IFA_HOST; } - add_addr(idev, &addr, plen, flag); + add_addr(idev, &addr, plen, flag, + IFAPROT_UNSPEC); addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_KERNEL); } @@ -3177,7 +3181,7 @@ 
static void init_loopback(struct net_device *dev) return; } - add_addr(idev, &in6addr_loopback, 128, IFA_HOST); + add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFAPROT_KERNEL_LO); } void addrconf_add_linklocal(struct inet6_dev *idev, @@ -3189,7 +3193,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, .ifa_flags = flags | IFA_F_PERMANENT, .valid_lft = INFINITY_LIFE_TIME, .preferred_lft = INFINITY_LIFE_TIME, - .scope = IFA_LINK + .scope = IFA_LINK, + .ifa_proto = IFAPROT_KERNEL_LL }; struct inet6_ifaddr *ifp; @@ -4627,6 +4632,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .len = sizeof(u32) }, [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFA_PROTO] = { .type = NLA_U8 }, }; static int @@ -4752,6 +4758,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, ifp->tstamp = jiffies; ifp->valid_lft = cfg->valid_lft; ifp->prefered_lft = cfg->preferred_lft; + ifp->ifa_proto = cfg->ifa_proto; if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) ifp->rt_priority = cfg->rt_priority; @@ -4845,6 +4852,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_RT_PRIORITY]) cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_PROTO]) + cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; @@ -4948,6 +4958,7 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(16) /* IFA_ADDRESS */ + nla_total_size(sizeof(struct ifa_cacheinfo)) + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */; } @@ -5025,6 +5036,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) goto error; + if (ifa->ifa_proto && + nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) + goto error; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 4b340a5a726dafba15b366c4009aa0a8f77631ac Mon Sep 17 00:00:00 2001 From: Mobashshera Rasool Date: Thu, 17 Feb 2022 07:46:40 +0000 Subject: net: ip6mr: add support for passing full packet on wrong mif This patch adds support for MRT6MSG_WRMIFWHOLE which is used to pass the full packet and the real vif id when the incoming interface is wrong. While the RP and FHR are setting up state we need to be sending the registers encapsulated with all the data inside, otherwise we lose it. The RP then decapsulates it and forwards it to the interested parties. Currently with WRONGMIF we can only be sending empty register packets and will lose that data. This behaviour can be enabled by using MRT_PIM with val == MRT6MSG_WRMIFWHOLE. This doesn't prevent MRT6MSG_WRONGMIF from happening; it happens in addition to it, and is controlled by the same throttling parameters as WRONGMIF (i.e. 1 packet per 3 seconds currently). Both messages are generated to keep backwards compatibility and avoid breaking someone who was enabling MRT_PIM with val == 4, since any positive val is accepted and treated the same. Signed-off-by: Mobashshera Rasool Signed-off-by: David S.
Miller --- include/uapi/linux/mroute6.h | 1 + net/ipv6/ip6mr.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index a1fd6173e2db..1d90c21a6251 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -134,6 +134,7 @@ struct mrt6msg { #define MRT6MSG_NOCACHE 1 #define MRT6MSG_WRONGMIF 2 #define MRT6MSG_WHOLEPKT 3 /* used for use level encap */ +#define MRT6MSG_WRMIFWHOLE 4 /* For PIM Register and assert processing */ __u8 im6_mbz; /* must be zero */ __u8 im6_msgtype; /* what type of message */ __u16 im6_mif; /* mif rec'd on */ diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0ebaaec3faf9..a9775c830194 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1040,7 +1040,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, int ret; #ifdef CONFIG_IPV6_PIMSM_V2 - if (assert == MRT6MSG_WHOLEPKT) + if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else @@ -1056,7 +1056,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 - if (assert == MRT6MSG_WHOLEPKT) { + if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and @@ -1068,8 +1068,11 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; - msg->im6_msgtype = MRT6MSG_WHOLEPKT; - msg->im6_mif = mrt->mroute_reg_vif_num; + msg->im6_msgtype = assert; + if (assert == MRT6MSG_WRMIFWHOLE) + msg->im6_mif = mifi; + else + msg->im6_mif = mrt->mroute_reg_vif_num; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; @@ -1650,6 +1653,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; + bool do_wrmifwhole; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1763,12 +1767,15 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; + + do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; + mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; @@ -2144,6 +2151,9 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); + if (mrt->mroute_do_wrvifwhole) + ip6mr_cache_report(mrt, skb, true_vifi, + MRT6MSG_WRMIFWHOLE); } goto dont_forward; } -- cgit v1.2.3 From a1dc6308865df719efb2a2f8a5f0f5979602d267 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Fri, 21 Jan 2022 16:08:16 +1030 Subject: fsi: sbefifo: Implement FSI_SBEFIFO_READ_TIMEOUT_SECONDS ioctl The FSI_SBEFIFO_READ_TIMEOUT_SECONDS ioctl sets the read timeout (in seconds) for the response received by the sbefifo device from the SBE. The timeout affects only the read operation on the current sbefifo device fd.
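From userspace, the new ioctl applies per open file descriptor; a minimal sketch of its use (fd is an open /dev/sbefifo* descriptor, error handling elided):

    #include <sys/ioctl.h>
    #include <linux/fsi.h>

    __u32 timeout = 30;	/* seconds; 10..120 accepted, 0 restores the default */
    ioctl(fd, FSI_SBEFIFO_READ_TIMEOUT_SECONDS, &timeout);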
Certain SBE operations can take a long time to complete and the default timeout of 10 seconds might not be sufficient to start receiving a response from the SBE. In such cases, allow the timeout to be set to a maximum of 120 seconds. The kernel does not contain the definition of the various SBE operations, so we must expose an interface to userspace to set the timeout for the given operation. Signed-off-by: Amitay Isaacs Signed-off-by: Joel Stanley Reviewed-by: Eddie James Link: https://lore.kernel.org/r/20220121053816.82253-3-joel@jms.id.au Signed-off-by: Joel Stanley --- drivers/fsi/fsi-sbefifo.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fsi.h | 14 ++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/fsi/fsi-sbefifo.c b/drivers/fsi/fsi-sbefifo.c index 1e9b326e8f67..f52a912cdf16 100644 --- a/drivers/fsi/fsi-sbefifo.c +++ b/drivers/fsi/fsi-sbefifo.c @@ -32,6 +32,8 @@ #include #include +#include + /* * The SBEFIFO is a pipe-like FSI device for communicating with * the self boot engine on POWER processors. */ @@ -134,6 +136,7 @@ struct sbefifo_user { void *cmd_page; void *pending_cmd; size_t pending_len; + u32 read_timeout_ms; }; static DEFINE_MUTEX(sbefifo_ffdc_mutex); @@ -796,6 +799,7 @@ static int sbefifo_user_open(struct inode *inode, struct file *file) return -ENOMEM; } mutex_init(&user->file_lock); + user->read_timeout_ms = SBEFIFO_TIMEOUT_START_RSP; return 0; } @@ -838,7 +842,9 @@ static ssize_t sbefifo_user_read(struct file *file, char __user *buf, rc = mutex_lock_interruptible(&sbefifo->lock); if (rc) goto bail; + sbefifo->timeout_start_rsp_ms = user->read_timeout_ms; rc = __sbefifo_submit(sbefifo, user->pending_cmd, cmd_len, &resp_iter); + sbefifo->timeout_start_rsp_ms = SBEFIFO_TIMEOUT_START_RSP; mutex_unlock(&sbefifo->lock); if (rc < 0) goto bail; @@ -928,12 +934,55 @@ static int sbefifo_user_release(struct inode *inode, struct file *file) return 0; } +static int sbefifo_read_timeout(struct sbefifo_user *user, void __user *argp) +{ + struct device *dev = &user->sbefifo->dev; + u32 timeout; + + if (get_user(timeout, (__u32 __user *)argp)) + return -EFAULT; + + if (timeout == 0) { + user->read_timeout_ms = SBEFIFO_TIMEOUT_START_RSP; + dev_dbg(dev, "Timeout reset to %d\n", user->read_timeout_ms); + return 0; + } + + if (timeout < 10 || timeout > 120) + return -EINVAL; + + user->read_timeout_ms = timeout * 1000; /* user timeout is in sec */ + + dev_dbg(dev, "Timeout set to %d\n", user->read_timeout_ms); + + return 0; +} + +static long sbefifo_user_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct sbefifo_user *user = file->private_data; + int rc = -ENOTTY; + + if (!user) + return -EINVAL; + + mutex_lock(&user->file_lock); + switch (cmd) { + case FSI_SBEFIFO_READ_TIMEOUT_SECONDS: + rc = sbefifo_read_timeout(user, (void __user *)arg); + break; + } + mutex_unlock(&user->file_lock); + return rc; +} + static const struct file_operations sbefifo_fops = { .owner = THIS_MODULE, .open = sbefifo_user_open, .read = sbefifo_user_read, .write = sbefifo_user_write, .release = sbefifo_user_release, + .unlocked_ioctl = sbefifo_user_ioctl, }; static void sbefifo_free(struct device *dev) diff --git a/include/uapi/linux/fsi.h b/include/uapi/linux/fsi.h index da577ecd90e7..b2f1977378c7 100644 --- a/include/uapi/linux/fsi.h +++ b/include/uapi/linux/fsi.h @@ -55,4 +55,18 @@ struct scom_access { #define FSI_SCOM_WRITE _IOWR('s', 0x02, struct scom_access) #define FSI_SCOM_RESET _IOW('s', 0x03, __u32) +/* + *
/dev/sbefifo* ioctl interface + */ + +/** + * FSI_SBEFIFO_READ_TIMEOUT sets the read timeout for response from SBE. + * + * The read timeout is specified in seconds. The minimum value of read + * timeout is 10 seconds (default) and the maximum value of read timeout is + * 120 seconds. A read timeout of 0 will reset the value to the default of + * (10 seconds). + */ +#define FSI_SBEFIFO_READ_TIMEOUT_SECONDS _IOW('s', 0x00, __u32) + #endif /* _UAPI_LINUX_FSI_H */ -- cgit v1.2.3 From 129e3c1bab24d27d0fa6e505a472345a92d7a2b0 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:57 +0800 Subject: bonding: add new option ns_ip6_target This patch adds a new bonding option, ns_ip6_target, which corresponds to arp_ip_target. With this we set IPv6 targets and send IPv6 NS requests to determine the health of the link. For other related options like the validation, we still use arp_validate, and will change to ns_validate later. Note: the sysfs configuration support was removed based on https://lore.kernel.org/netdev/8863.1645071997@famine Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 11 ++++++ drivers/net/bonding/bond_netlink.c | 59 +++++++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 72 ++++++++++++++++++++++++++++++++++++ include/net/bond_options.h | 4 ++ include/net/bonding.h | 7 ++++ include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 7 files changed, 155 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index ab98373535ea..525e6842dd33 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -313,6 +313,17 @@ arp_ip_target maximum number of targets that can be specified is 16. The default value is no IP addresses. +ns_ip6_target + + Specifies the IPv6 addresses to use as IPv6 monitoring peers when + arp_interval is > 0. These are the targets of the NS request + sent to determine the health of the link to the targets. + Specify these values in ffff:ffff::ffff:ffff format. Multiple IPv6 + addresses must be separated by a comma. At least one IPv6 + address must be given for NS/NA monitoring to function. The + maximum number of targets that can be specified is 16. The + default value is no IPv6 addresses.
+ arp_validate Specifies whether or not ARP probes and replies should be diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 1007bf6d385d..f427fa1737c7 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -14,6 +14,7 @@ #include #include #include +#include static size_t bond_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) @@ -111,6 +112,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_TLB_DYNAMIC_LB] = { .type = NLA_U8 }, [IFLA_BOND_PEER_NOTIF_DELAY] = { .type = NLA_U32 }, [IFLA_BOND_MISSED_MAX] = { .type = NLA_U8 }, + [IFLA_BOND_NS_IP6_TARGET] = { .type = NLA_NESTED }, }; static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { @@ -272,6 +274,40 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], if (err) return err; } +#if IS_ENABLED(CONFIG_IPV6) + if (data[IFLA_BOND_NS_IP6_TARGET]) { + struct nlattr *attr; + int i = 0, rem; + + bond_option_ns_ip6_targets_clear(bond); + nla_for_each_nested(attr, data[IFLA_BOND_NS_IP6_TARGET], rem) { + struct in6_addr addr6; + + if (nla_len(attr) < sizeof(addr6)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address"); + return -EINVAL; + } + + addr6 = nla_get_in6_addr(attr); + + if (ipv6_addr_type(&addr6) & IPV6_ADDR_LINKLOCAL) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 addr6"); + return -EINVAL; + } + + bond_opt_initextra(&newval, &addr6, sizeof(addr6)); + err = __bond_opt_set(bond, BOND_OPT_NS_TARGETS, + &newval); + if (err) + break; + i++; + } + if (i == 0 && bond->params.arp_interval) + netdev_warn(bond->dev, "Removing last ns target with arp_interval on\n"); + if (err) + return err; + } +#endif if (data[IFLA_BOND_ARP_VALIDATE]) { int arp_validate = nla_get_u32(data[IFLA_BOND_ARP_VALIDATE]); @@ -526,6 +562,9 @@ static size_t bond_get_size(const struct net_device *bond_dev) nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PEER_NOTIF_DELAY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_MISSED_MAX */ + /* IFLA_BOND_NS_IP6_TARGET */ + nla_total_size(sizeof(struct nlattr)) + + nla_total_size(sizeof(struct in6_addr)) * BOND_MAX_NS_TARGETS + 0; } @@ -603,6 +642,26 @@ static int bond_fill_info(struct sk_buff *skb, bond->params.arp_all_targets)) goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + targets = nla_nest_start(skb, IFLA_BOND_NS_IP6_TARGET); + if (!targets) + goto nla_put_failure; + + targets_added = 0; + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + if (!ipv6_addr_any(&bond->params.ns_targets[i])) { + if (nla_put_in6_addr(skb, i, &bond->params.ns_targets[i])) + goto nla_put_failure; + targets_added = 1; + } + } + + if (targets_added) + nla_nest_end(skb, targets); + else + nla_nest_cancel(skb, targets); +#endif + primary = rtnl_dereference(bond->primary_slave); if (primary && nla_put_u32(skb, IFLA_BOND_PRIMARY, primary->dev->ifindex)) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index ab575135b626..64f7db2627ce 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -34,6 +34,10 @@ static int bond_option_arp_ip_target_add(struct bonding *bond, __be32 target); static int bond_option_arp_ip_target_rem(struct bonding *bond, __be32 target); static int bond_option_arp_ip_targets_set(struct bonding *bond, const struct bond_opt_value *newval); +#if IS_ENABLED(CONFIG_IPV6) +static int bond_option_ns_ip6_targets_set(struct bonding *bond, + 
const struct bond_opt_value *newval); +#endif static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_all_targets_set(struct bonding *bond, @@ -295,6 +299,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_arp_ip_targets_set }, +#if IS_ENABLED(CONFIG_IPV6) + [BOND_OPT_NS_TARGETS] = { + .id = BOND_OPT_NS_TARGETS, + .name = "ns_ip6_target", + .desc = "NS targets in ffff:ffff::ffff:ffff form", + .flags = BOND_OPTFLAG_RAWVAL, + .set = bond_option_ns_ip6_targets_set + }, +#endif [BOND_OPT_DOWNDELAY] = { .id = BOND_OPT_DOWNDELAY, .name = "downdelay", @@ -1184,6 +1197,65 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond, return ret; } +#if IS_ENABLED(CONFIG_IPV6) +static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, + struct in6_addr *target, + unsigned long last_rx) +{ + struct in6_addr *targets = bond->params.ns_targets; + struct list_head *iter; + struct slave *slave; + + if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) { + bond_for_each_slave(bond, slave, iter) + slave->target_last_arp_rx[slot] = last_rx; + targets[slot] = *target; + } +} + +void bond_option_ns_ip6_targets_clear(struct bonding *bond) +{ + struct in6_addr addr_any = in6addr_any; + int i; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) + _bond_options_ns_ip6_target_set(bond, i, &addr_any, 0); +} + +static int bond_option_ns_ip6_targets_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct in6_addr *target = (struct in6_addr *)newval->extra; + struct in6_addr *targets = bond->params.ns_targets; + struct in6_addr addr_any = in6addr_any; + int index; + + if (!bond_is_ip6_target_ok(target)) { + netdev_err(bond->dev, "invalid NS target %pI6c specified for addition\n", + target); + return -EINVAL; + } + + if (bond_get_targets_ip6(targets, target) != -1) { /* dup */ + netdev_err(bond->dev, "NS target %pI6c is already present\n", + target); + return -EINVAL; + } + + index = bond_get_targets_ip6(targets, &addr_any); /* first free slot */ + if (index == -1) { + netdev_err(bond->dev, "NS target table is full!\n"); + return -EINVAL; + } + + netdev_dbg(bond->dev, "Adding NS target %pI6c\n", target); + + _bond_options_ns_ip6_target_set(bond, index, target, jiffies); + + return 0; +} +#endif + static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 286b29c6c451..61b49063791c 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -66,6 +66,7 @@ enum { BOND_OPT_PEER_NOTIF_DELAY, BOND_OPT_LACP_ACTIVE, BOND_OPT_MISSED_MAX, + BOND_OPT_NS_TARGETS, BOND_OPT_LAST }; @@ -140,5 +141,8 @@ static inline void __bond_opt_init(struct bond_opt_value *optval, __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) void bond_option_arp_ip_targets_clear(struct bonding *bond); +#if IS_ENABLED(CONFIG_IPV6) +void bond_option_ns_ip6_targets_clear(struct bonding *bond); +#endif #endif /* _NET_BOND_OPTIONS_H */ diff --git a/include/net/bonding.h b/include/net/bonding.h index f3b986f6b6e4..d0dfe727e0b1 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -503,6 +503,13 @@ static inline int bond_is_ip_target_ok(__be32 addr) return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } +static inline int bond_is_ip6_target_ok(struct in6_addr *addr) +{ + return !ipv6_addr_any(addr) && + !ipv6_addr_loopback(addr) && + 
!ipv6_addr_is_multicast(addr); +} + /* Get the oldest arp which we've received on this slave for bond's * arp_targets. */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6218f93f5c1a..e1ba2d51b717 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -860,6 +860,7 @@ enum { IFLA_BOND_PEER_NOTIF_DELAY, IFLA_BOND_AD_LACP_ACTIVE, IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, __IFLA_BOND_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 6218f93f5c1a..e1ba2d51b717 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -860,6 +860,7 @@ enum { IFLA_BOND_PEER_NOTIF_DELAY, IFLA_BOND_AD_LACP_ACTIVE, IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, __IFLA_BOND_MAX, }; -- cgit v1.2.3 From c086df4902573e2f06c6a2a83452c13a8bc603f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:52:52 -0500 Subject: fuse: move FUSE_SUPER_MAGIC definition to magic.h ...to help userland apps that need to identify FUSE mounts. Signed-off-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 +-- include/uapi/linux/magic.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ee846ce371d8..9ee36aa73251 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 0425cd79af9a..f724129c0425 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -36,6 +36,7 @@ #define EFIVARFS_MAGIC 0xde5e81e4 #define HOSTFS_SUPER_MAGIC 0x00c0ffee #define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#define FUSE_SUPER_MAGIC 0x65735546 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ -- cgit v1.2.3 From d43583b890e7cb0078d13d056753a56602b92406 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Feb 2022 15:35:23 +0000 Subject: KVM: arm64: Expose PSCI SYSTEM_RESET2 call to the guest PSCI v1.1 introduces the optional SYSTEM_RESET2 call, which allows the caller to provide a vendor-specific "reset type" and "cookie" to request a particular form of reset or shutdown. Expose this call to the guest and handle it in the same way as PSCI SYSTEM_RESET, along with some basic range checking on the type argument. 
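From the guest's side, SYSTEM_RESET2 is just another SMCCC/PSCI call; a Linux guest might invoke it roughly as below (a sketch using the kernel's arm_smccc helpers from <linux/arm-smccc.h>; arg1 is the reset type, arg2 the vendor-defined cookie, and per the range check below only type 0 or vendor types at or above PSCI_1_1_RESET_TYPE_VENDOR_START are accepted):

    struct arm_smccc_res res;

    /* request an architectural warm reset; does not return on success */
    arm_smccc_1_1_invoke(PSCI_1_1_FN64_SYSTEM_RESET2,
                         PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET, 0, &res);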
Cc: Marc Zyngier Cc: James Morse Cc: Alexandru Elisei Cc: Suzuki K Poulose Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220221153524.15397-3-will@kernel.org --- arch/arm64/kvm/psci.c | 33 +++++++++++++++++++++++++++++---- include/uapi/linux/psci.h | 4 ++++ 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 70d07477e712..30fcc5a99483 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -308,7 +308,7 @@ out: static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) { u32 psci_fn = smccc_get_function(vcpu); - u32 feature; + u32 arg; unsigned long val; int ret = 1; @@ -320,12 +320,12 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; break; case PSCI_1_0_FN_PSCI_FEATURES: - feature = smccc_get_arg1(vcpu); - val = kvm_psci_check_allowed_function(vcpu, feature); + arg = smccc_get_arg1(vcpu); + val = kvm_psci_check_allowed_function(vcpu, arg); if (val) break; - switch(feature) { + switch(arg) { case PSCI_0_2_FN_PSCI_VERSION: case PSCI_0_2_FN_CPU_SUSPEND: case PSCI_0_2_FN64_CPU_SUSPEND: @@ -341,11 +341,36 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) case ARM_SMCCC_VERSION_FUNC_ID: val = 0; break; + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + if (minor >= 1) { + val = 0; + break; + } + fallthrough; default: val = PSCI_RET_NOT_SUPPORTED; break; } break; + case PSCI_1_1_FN_SYSTEM_RESET2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_1_FN64_SYSTEM_RESET2: + if (minor >= 1) { + arg = smccc_get_arg1(vcpu); + + if (arg > PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET && + arg < PSCI_1_1_RESET_TYPE_VENDOR_START) { + val = PSCI_RET_INVALID_PARAMS; + } else { + kvm_psci_system_reset(vcpu); + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + } + break; + }; + fallthrough; default: return kvm_psci_0_2_call(vcpu); } diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 2fcad1dd0b0e..2bf93c0d6354 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -82,6 +82,10 @@ #define PSCI_0_2_TOS_UP_NO_MIGRATE 1 #define PSCI_0_2_TOS_MP 2 +/* PSCI v1.1 reset type encoding for SYSTEM_RESET2 */ +#define PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET 0 +#define PSCI_1_1_RESET_TYPE_VENDOR_START 0x80000000U + /* PSCI version decoding (independent of PSCI version) */ #define PSCI_VERSION_MAJOR_SHIFT 16 #define PSCI_VERSION_MINOR_MASK \ -- cgit v1.2.3 From a603ca60cebff8589882427a67f870ed946b3fc8 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Thu, 10 Feb 2022 16:42:03 -0800 Subject: serial: 8250_aspeed_vuart: add PORT_ASPEED_VUART port type Commit 54da3e381c2b ("serial: 8250_aspeed_vuart: use UPF_IOREMAP to set up register mapping") fixed a bug that had, as a side-effect, prevented the 8250_aspeed_vuart driver from enabling the VUART's FIFOs. However, fixing that (and hence enabling the FIFOs) has in turn revealed what appears to be a hardware bug in the ASPEED VUART in which the host-side THRE bit doesn't get set if the BMC-side receive FIFO trigger level is set to anything but one byte. This causes problems for polled-mode writes from the host -- for example, Linux kernel console writes proceed at a glacial pace (less than 100 bytes per second) because the write path waits for a 10ms timeout to expire after every character instead of being able to continue on to the next character upon seeing THRE asserted.
(GRUB behaves similarly.) As a workaround, introduce a new port type for the ASPEED VUART that's identical to the PORT_16550A type it had previously been using, but with UART_FCR_R_TRIG_00 in its FCR configuration to set the receive FIFO trigger level to one byte, which (experimentally) seems to avoid the problematic THRE behavior. Fixes: 54da3e381c2b ("serial: 8250_aspeed_vuart: use UPF_IOREMAP to set up register mapping") Tested-by: Konstantin Aladyshev Reviewed-by: Andy Shevchenko Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20220211004203.14915-1-zev@bewilderbeest.net Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_aspeed_vuart.c | 2 +- drivers/tty/serial/8250/8250_port.c | 8 ++++++++ include/uapi/linux/serial_core.h | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c index 2350fb3bb5e4..c2cecc6f47db 100644 --- a/drivers/tty/serial/8250/8250_aspeed_vuart.c +++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c @@ -487,7 +487,7 @@ static int aspeed_vuart_probe(struct platform_device *pdev) port.port.irq = irq_of_parse_and_map(np, 0); port.port.handle_irq = aspeed_vuart_handle_irq; port.port.iotype = UPIO_MEM; - port.port.type = PORT_16550A; + port.port.type = PORT_ASPEED_VUART; port.port.uartclk = clk; port.port.flags = UPF_SHARE_IRQ | UPF_BOOT_AUTOCONF | UPF_IOREMAP | UPF_FIXED_PORT | UPF_FIXED_TYPE | UPF_NO_THRE_TEST; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 3b12bfc1ed67..973870ebff69 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -307,6 +307,14 @@ static const struct serial8250_config uart_config[] = { .rxtrig_bytes = {1, 32, 64, 112}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, + [PORT_ASPEED_VUART] = { + .name = "ASPEED VUART", + .fifo_size = 16, + .tx_loadsz = 16, + .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, + .rxtrig_bytes = {1, 4, 8, 14}, + .flags = UART_CAP_FIFO, + }, }; /* Uart divisor latch read */ diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index c4042dcfdc0c..8885e69178bd 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -68,6 +68,9 @@ /* NVIDIA Tegra Combined UART */ #define PORT_TEGRA_TCU 41 +/* ASPEED AST2x00 virtual UART */ +#define PORT_ASPEED_VUART 42 + /* Intel EG20 */ #define PORT_PCH_8LINE 44 #define PORT_PCH_2LINE 45 -- cgit v1.2.3 From a1a5cfe70cd29a59a9a85290dfe95ed1c8df1193 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Mon, 14 Feb 2022 09:38:06 +0200 Subject: iio: introduce mag_referenced Some accelerometers that support activity and inactivity events also support a referenced mode, in which the gravitational acceleration is taken as a point of reference before comparing the acceleration to the specified activity and inactivity magnitude. For example, in the case of the ADXL367, for activity detection, the formula is: abs(acceleration - reference) > magnitude Add a new event type that makes this behavior clear.
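To make the referenced comparison concrete, here is a minimal userspace-style sketch of the detection rule described above (the helper name is illustrative, not driver code):

	#include <stdbool.h>
	#include <stdlib.h>	/* abs() */

	/* Referenced activity detection: compare against a stored gravity
	 * reference rather than against absolute zero. */
	static bool mag_referenced_active(int accel, int reference, int magnitude)
	{
		return abs(accel - reference) > magnitude;
	}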
Signed-off-by: Cosmin Tanislav Link: https://lore.kernel.org/r/20220214073810.781016-2-cosmin.tanislav@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-event.c | 1 + include/uapi/linux/iio/types.h | 1 + tools/iio/iio_event_monitor.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index d0732eac0f0a..ce8b102ce52f 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -230,6 +230,7 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", [IIO_EV_TYPE_CHANGE] = "change", + [IIO_EV_TYPE_MAG_REFERENCED] = "mag_referenced", }; static const char * const iio_ev_dir_text[] = { diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 48c13147c0a8..472cead10d8d 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -104,6 +104,7 @@ enum iio_event_type { IIO_EV_TYPE_THRESH_ADAPTIVE, IIO_EV_TYPE_MAG_ADAPTIVE, IIO_EV_TYPE_CHANGE, + IIO_EV_TYPE_MAG_REFERENCED, }; enum iio_event_direction { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index b94a16ba5c6c..2f4581658859 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -68,6 +68,7 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_THRESH_ADAPTIVE] = "thresh_adaptive", [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", [IIO_EV_TYPE_CHANGE] = "change", + [IIO_EV_TYPE_MAG_REFERENCED] = "mag_referenced", }; static const char * const iio_ev_dir_text[] = { -- cgit v1.2.3 From 0fbb4d93b38bce1f8235aacfa37e90ad8f011473 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Feb 2022 23:40:32 -0500 Subject: dm: add dm_submit_bio_remap interface Where possible, switch from early bio-based IO accounting (at the time DM clones each incoming bio) to late IO accounting just before each remapped bio is issued to the underlying device via submit_bio_noacct(). Allows more precise bio-based IO accounting for DM targets that use their own workqueues to perform additional processing of each bio in conjunction with their DM_MAPIO_SUBMITTED return from their map function. When a target is updated to use dm_submit_bio_remap() it must also set ti->accounts_remapped_io to true. Use xchg() in start_io_acct(), as suggested by Mikulas, to ensure each IO is only started once. The xchg race only happens if __send_duplicate_bios() sends multiple bios -- that case is reflected via tio->is_duplicate_bio. Given the niche nature of this race, it is best to avoid any xchg performance penalty for normal IO. For IO that was never submitted with dm_submit_bio_remap(), but where the target completes the clone with bio_endio, accounting is started then ended and the pending_io counter is decremented.
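A sketch of the intended usage pattern for a target with its own workqueue (hypothetical target code under the interface this patch adds, not part of the patch itself; the my_* helpers are illustrative):

	/* The target's .ctr must set ti->accounts_remapped_io = true. */
	static int my_target_map(struct dm_target *ti, struct bio *clone)
	{
		/* take ownership; a driver-managed worker will issue it */
		my_target_queue_bio(ti, clone);	/* hypothetical helper */
		return DM_MAPIO_SUBMITTED;
	}

	static void my_target_worker(struct work_struct *work)
	{
		struct bio *clone = my_work_to_clone(work);	/* hypothetical */

		/* Passing NULL submits @clone itself; from_wq is true because
		 * this runs outside the target's map function. */
		dm_submit_bio_remap(clone, NULL, true);
	}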
Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 2 + drivers/md/dm.c | 127 +++++++++++++++++++++++++++++++++++------- include/linux/device-mapper.h | 7 +++ include/uapi/linux/dm-ioctl.h | 4 +- 4 files changed, 118 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 33ef92e90462..8078b6c155ef 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -232,6 +232,8 @@ struct dm_io { struct mapped_device *md; struct bio *orig_bio; blk_status_t status; + bool start_io_acct:1; + int was_accounted; unsigned long start_time; spinlock_t endio_lock; struct dm_stats_aux stats_aux; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d1c618c3f6c6..082366d0ad49 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -518,14 +518,33 @@ static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, bio->bi_iter.bi_size = bi_size; } -static void dm_start_io_acct(struct dm_io *io) +static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) { - dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux); + dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); } -static void dm_end_io_acct(struct dm_io *io) +static void dm_start_io_acct(struct dm_io *io, struct bio *clone) { - dm_io_acct(true, io->md, io->orig_bio, io->start_time, &io->stats_aux); + /* Must account IO to DM device in terms of orig_bio */ + struct bio *bio = io->orig_bio; + + /* + * Ensure IO accounting is only ever started once. + * Expect no possibility for race unless is_duplicate_bio. + */ + if (!clone || likely(!clone_to_tio(clone)->is_duplicate_bio)) { + if (WARN_ON(io->was_accounted)) + return; + io->was_accounted = 1; + } else if (xchg(&io->was_accounted, 1) == 1) + return; + + __dm_start_io_acct(io, bio); +} + +static void dm_end_io_acct(struct dm_io *io, struct bio *bio) +{ + dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) @@ -545,11 +564,13 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io->status = 0; atomic_set(&io->io_count, 1); this_cpu_inc(*md->pending_io); - io->orig_bio = bio; + io->orig_bio = NULL; io->md = md; spin_lock_init(&io->endio_lock); io->start_time = jiffies; + io->start_io_acct = false; + io->was_accounted = 0; dm_stats_record_start(&md->stats, &io->stats_aux); @@ -849,7 +870,16 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error) } io_error = io->status; - dm_end_io_acct(io); + if (io->was_accounted) + dm_end_io_acct(io, bio); + else if (!io_error) { + /* + * Must handle target that DM_MAPIO_SUBMITTED only to + * then bio_endio() rather than dm_submit_bio_remap() + */ + __dm_start_io_acct(io, bio); + dm_end_io_acct(io, bio); + } free_io(io); smp_wmb(); this_cpu_dec(*md->pending_io); @@ -1131,6 +1161,56 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +static inline void __dm_submit_bio_remap(struct bio *clone, + dev_t dev, sector_t old_sector) +{ + trace_block_bio_remap(clone, dev, old_sector); + submit_bio_noacct(clone); +} + +/* + * @clone: clone bio that DM core passed to target's .map function + * @tgt_clone: clone of @clone bio that target needs submitted + * @from_wq: caller is a workqueue thread managed by DM target + * + * Targets should use this interface to submit bios they take + * ownership of when returning DM_MAPIO_SUBMITTED. 
+ * + * Target should also enable ti->accounts_remapped_io + */ +void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone, + bool from_wq) +{ + struct dm_target_io *tio = clone_to_tio(clone); + struct dm_io *io = tio->io; + + /* establish bio that will get submitted */ + if (!tgt_clone) + tgt_clone = clone; + + /* + * Account io->origin_bio to DM dev on behalf of target + * that took ownership of IO with DM_MAPIO_SUBMITTED. + */ + if (!from_wq) { + /* Still in target's map function */ + io->start_io_acct = true; + } else { + /* + * Called by another thread, managed by DM target, + * wait for dm_split_and_process_bio() to store + * io->orig_bio + */ + while (unlikely(!smp_load_acquire(&io->orig_bio))) + msleep(1); + dm_start_io_acct(io, clone); + } + + __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), + tio->old_sector); +} +EXPORT_SYMBOL_GPL(dm_submit_bio_remap); + static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) { mutex_lock(&md->swap_bios_lock); @@ -1157,9 +1237,7 @@ static void __map_bio(struct bio *clone) clone->bi_end_io = clone_endio; /* - * Map the clone. If r == 0 we don't need to do - * anything, the target has assumed ownership of - * this io. + * Map the clone. */ dm_io_inc_pending(io); tio->old_sector = clone->bi_iter.bi_sector; @@ -1184,12 +1262,18 @@ static void __map_bio(struct bio *clone) switch (r) { case DM_MAPIO_SUBMITTED: + /* target has assumed ownership of this io */ + if (!ti->accounts_remapped_io) + io->start_io_acct = true; break; case DM_MAPIO_REMAPPED: - /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(clone, bio_dev(io->orig_bio), + /* + * the bio has been remapped so dispatch it, but defer + * dm_start_io_acct() until after possible bio_split(). + */ + __dm_submit_bio_remap(clone, disk_devt(io->md->disk), tio->old_sector); - submit_bio_noacct(clone); + io->start_io_acct = true; break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: @@ -1404,7 +1488,7 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { struct clone_info ci; - struct bio *b; + struct bio *orig_bio = NULL; int error = 0; init_clone_info(&ci, md, map, bio); @@ -1426,15 +1510,18 @@ static void dm_split_and_process_bio(struct mapped_device *md, * used by dm_end_io_acct() and for dm_io_dec_pending() to use for * completion handling. */ - b = bio_split(bio, bio_sectors(bio) - ci.sector_count, - GFP_NOIO, &md->queue->bio_split); - ci.io->orig_bio = b; - - bio_chain(b, bio); - trace_block_split(b, bio->bi_iter.bi_sector); + orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, + GFP_NOIO, &md->queue->bio_split); + bio_chain(orig_bio, bio); + trace_block_split(orig_bio, bio->bi_iter.bi_sector); submit_bio_noacct(bio); out: - dm_start_io_acct(ci.io); + if (!orig_bio) + orig_bio = bio; + smp_store_release(&ci.io->orig_bio, orig_bio); + if (ci.io->start_io_acct) + dm_start_io_acct(ci.io, NULL); + /* drop the extra reference count */ dm_io_dec_pending(ci.io, errno_to_blk_status(error)); } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b26fecf6c8e8..7752d14d13f8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -362,6 +362,12 @@ struct dm_target { * zone append operations using regular writes. */ bool emulate_zone_append:1; + + /* + * Set if the target will submit IO using dm_submit_bio_remap() + * after returning DM_MAPIO_SUBMITTED from its map function. 
+ */ + bool accounts_remapped_io:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); @@ -465,6 +471,7 @@ int dm_suspended(struct dm_target *ti); int dm_post_suspending(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); +void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone, bool from_wq); union map_info *dm_get_rq_mapinfo(struct request *rq); #ifdef CONFIG_BLK_DEV_ZONED diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index c12ce30b52df..2e9550fef90f 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -286,9 +286,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 45 +#define DM_VERSION_MINOR 46 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2021-03-22)" +#define DM_VERSION_EXTRA "-ioctl (2022-02-22)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 93b71801a8274cd9511557faf04365a5de487197 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 09:06:54 -0500 Subject: KVM: PPC: reserve capability 210 for KVM_CAP_PPC_AIL_MODE_3 Add KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. Reviewed-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ include/uapi/linux/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 1 + 3 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a4267104db50..9954568c7eab 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6997,6 +6997,20 @@ indicated by the fd to the VM this is called on. This is intended to support intra-host migration of VMs between userspace VMMs, upgrading the VMM process without interrupting the guest. +7.30 KVM_CAP_PPC_AIL_MODE_3 +------------------------------- + +:Capability: KVM_CAP_PPC_AIL_MODE_3 +:Architectures: ppc +:Type: vm + +This capability indicates that the kernel supports the mode 3 setting for the +"Address Translation Mode on Interrupt" aka "Alternate Interrupt Location" +resource that is controlled with the H_SET_MODE hypercall. + +This capability allows a guest kernel to use a better-performance mode for +handling interrupts and system calls. + 8. Other capabilities. 
====================== diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From a21d9a670d81103db7f788de1a4a4a6e4b891a0b Mon Sep 17 00:00:00 2001 From: Hans Schultz Date: Wed, 23 Feb 2022 11:16:46 +0100 Subject: net: bridge: Add support for bridge port in locked mode In an 802.1X scenario, clients connected to a bridge port shall not be allowed to have traffic forwarded until fully authenticated. A static fdb entry of the client's MAC address for the bridge port unlocks the client and allows bidirectional communication. This scenario is facilitated by setting the bridge port in locked mode, which is also supported by various switchcore chipsets. Signed-off-by: Hans Schultz Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_input.c | 11 ++++++++++- net/bridge/br_netlink.c | 6 +++++- 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 509e18c7e740..3aae023a9353 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -58,6 +58,7 @@ struct br_ip_list { #define BR_MRP_LOST_CONT BIT(18) #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) +#define BR_PORT_LOCKED BIT(21) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e1ba2d51b717..be09d2ad4b5d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -537,6 +537,7 @@ enum { IFLA_BRPORT_MRP_IN_OPEN, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b50382f957c1..e0c13fcc50ed 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -81,6 +81,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; + br = p->br; brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; state = p->state; @@ -88,10 +89,18 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb &state, &vlan)) goto out; + if (p->flags & BR_PORT_LOCKED) { + struct net_bridge_fdb_entry *fdb_src = + br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); + + if (!fdb_src || READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + goto drop; + } + nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ - br = p->br; if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index
2ff83d84230d..7d4432ca9a20 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -184,6 +184,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -269,7 +270,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || - nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) + nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -827,6 +829,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, + [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -893,6 +896,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); changed_mask = old_flags ^ p->flags; -- cgit v1.2.3 From 1241e329ce2e1f5b1039fd356b75867b29721ad2 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 23 Feb 2022 00:09:12 +0530 Subject: ethtool: add support to set/get completion queue event size Add support to set completion queue event size via ethtool -G parameter and get it via ethtool -g parameter. 
~ # ./ethtool -G eth0 cqe-size 512 ~ # ./ethtool -g eth0 Ring parameters for eth0: Pre-set maximums: RX: 1048576 RX Mini: n/a RX Jumbo: n/a TX: 1048576 Current hardware settings: RX: 256 RX Mini: n/a RX Jumbo: n/a TX: 4096 RX Buf Len: 2048 CQE Size: 128 Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 11 +++++++++++ include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.h | 2 +- net/ethtool/rings.c | 19 +++++++++++++++++-- 5 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index cae28af7a476..24d9be69065d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -861,6 +861,7 @@ Kernel response contents: ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` u8 TCP header / data split + ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE ==================================== ====== =========================== ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with @@ -885,6 +886,7 @@ Request contents: ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE ==================================== ====== =========================== Kernel checks that requested ring sizes do not exceed limits reported by @@ -892,6 +894,15 @@ driver. Driver may impose additional constraints and may not support all attributes. +``ETHTOOL_A_RINGS_CQE_SIZE`` specifies the completion queue event size. +Completion queue events (CQE) are the events posted by the NIC to indicate the +completion status of a packet when the packet is sent (like send success or +error) or received (like pointers to packet fragments). The CQE size parameter +allows modifying the CQE size from the default if the NIC supports it. +A bigger CQE can hold more receive buffer pointers, and in turn the NIC can +transfer a bigger frame from the wire. Based on the NIC hardware, the overall +completion queue size can be adjusted in the driver if the CQE size is modified. + CHANNELS_GET ============ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e0853f48b75e..4af58459a1e7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -71,18 +71,22 @@ enum { * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring.
* @tcp_data_split: Scatter packet headers and data to separate buffers + * @cqe_size: Size of TX/RX completion queue event */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; + u32 cqe_size; }; /** * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len + * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), + ETHTOOL_RING_USE_CQE_SIZE = BIT(1), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 417d4280d7b5..979850221b8d 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -337,6 +337,7 @@ enum { ETHTOOL_A_RINGS_TX, /* u32 */ ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ + ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 75856db299e9..29d01662a48b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -363,7 +363,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_BUF_LEN + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_CQE_SIZE + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 18a5035d3bee..9f33c9689b56 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -54,7 +54,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ - nla_total_size(sizeof(u8)); /* _RINGS_TCP_DATA_SPLIT */ + nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ + nla_total_size(sizeof(u32)); /* _RINGS_CQE_SIZE */ } static int rings_fill_reply(struct sk_buff *skb, @@ -91,7 +92,9 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, - kr->tcp_data_split)))) + kr->tcp_data_split))) || + (kr->cqe_size && + (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size)))) return -EMSGSIZE; return 0; @@ -119,6 +122,7 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) @@ -159,6 +163,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, 
tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); + ethnl_update_u32(&kernel_ringparam.cqe_size, + tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ret = 0; if (!mod) goto out_ops; @@ -190,6 +196,15 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) goto out_ops; } + if (kernel_ringparam.cqe_size && + !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { + ret = -EOPNOTSUPP; + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_CQE_SIZE], + "setting cqe size not supported"); + goto out_ops; + } + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); if (ret < 0) -- cgit v1.2.3 From 28a3f0601727d521a1c6cce62ecbcb7402a9e4f5 Mon Sep 17 00:00:00 2001 From: Toms Atteka Date: Wed, 23 Feb 2022 16:54:09 -0800 Subject: net: openvswitch: IPv6: Add IPv6 extension header support This change adds a new OpenFlow field OFPXMT_OFB_IPV6_EXTHDR and packets can be filtered using ipv6_ext flag. Signed-off-by: Toms Atteka Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 6 ++ net/openvswitch/flow.c | 140 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/flow.h | 14 ++++ net/openvswitch/flow_netlink.c | 26 +++++++- 4 files changed, 184 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 150bcff49b1c..9d1710f20505 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -351,6 +351,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ + OVS_KEY_ATTR_IPV6_EXTHDRS, /* struct ovs_key_ipv6_exthdr */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -430,6 +431,11 @@ struct ovs_key_ipv6 { __u8 ipv6_frag; /* One of OVS_FRAG_TYPE_*. */ }; +/* separate structure to support backward compatibility with older user space */ +struct ovs_key_ipv6_exthdrs { + __u16 hdrs; +}; + struct ovs_key_tcp { __be16 tcp_src; __be16 tcp_dst; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index f6cd24fd530c..8df73d86b968 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -241,6 +241,144 @@ static bool icmphdr_ok(struct sk_buff *skb) sizeof(struct icmphdr)); } +/** + * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags. + * + * @skb: buffer where extension header data starts in packet + * @nh: ipv6 header + * @ext_hdrs: flags are stored here + * + * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header + * is unexpectedly encountered. (Two destination options headers may be + * expected and would not cause this bit to be set.) 
+ * + * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order + * preferred (but not required) by RFC 2460: + * + * When more than one extension header is used in the same packet, it is + * recommended that those headers appear in the following order: + * IPv6 header + * Hop-by-Hop Options header + * Destination Options header + * Routing header + * Fragment header + * Authentication header + * Encapsulating Security Payload header + * Destination Options header + * upper-layer header + */ +static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh, + u16 *ext_hdrs) +{ + u8 next_type = nh->nexthdr; + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + int dest_options_header_count = 0; + + *ext_hdrs = 0; + + while (ipv6_ext_hdr(next_type)) { + struct ipv6_opt_hdr _hdr, *hp; + + switch (next_type) { + case IPPROTO_NONE: + *ext_hdrs |= OFPIEH12_NONEXT; + /* stop parsing */ + return; + + case IPPROTO_ESP: + if (*ext_hdrs & OFPIEH12_ESP) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | IPPROTO_FRAGMENT | + OFPIEH12_AUTH | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ESP; + break; + + case IPPROTO_AH: + if (*ext_hdrs & OFPIEH12_AUTH) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER | + IPPROTO_FRAGMENT | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_AUTH; + break; + + case IPPROTO_DSTOPTS: + if (dest_options_header_count == 0) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_UNREP)) + *ext_hdrs |= OFPIEH12_UNSEQ; + *ext_hdrs |= OFPIEH12_DEST; + } else if (dest_options_header_count == 1) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | OFPIEH12_FRAG | + OFPIEH12_AUTH | OFPIEH12_ESP | + OFPIEH12_UNREP)) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + } else { + *ext_hdrs |= OFPIEH12_UNREP; + } + dest_options_header_count++; + break; + + case IPPROTO_FRAGMENT: + if (*ext_hdrs & OFPIEH12_FRAG) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_ROUTER | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_FRAG; + break; + + case IPPROTO_ROUTING: + if (*ext_hdrs & OFPIEH12_ROUTER) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ROUTER; + break; + + case IPPROTO_HOPOPTS: + if (*ext_hdrs & OFPIEH12_HOP) + *ext_hdrs |= OFPIEH12_UNREP; + /* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6 + * extension header is present as the first + * extension header in the packet. 
+ */ + if (*ext_hdrs == 0) + *ext_hdrs |= OFPIEH12_HOP; + else + *ext_hdrs |= OFPIEH12_UNSEQ; + break; + + default: + return; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (!hp) + break; + next_type = hp->nexthdr; + start += ipv6_optlen(hp); + }; +} + static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned short frag_off; @@ -256,6 +394,8 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) nh = ipv6_hdr(skb); + get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs); + key->ip.proto = NEXTHDR_NONE; key->ip.tos = ipv6_get_dsfield(nh); key->ip.ttl = nh->hop_limit; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 758a8c77f736..073ab73ffeaa 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,6 +32,19 @@ enum sw_flow_mac_proto { #define SW_FLOW_KEY_INVALID 0x80 #define MPLS_LABEL_DEPTH 3 +/* Bit definitions for IPv6 Extension Header pseudo-field. */ +enum ofp12_ipv6exthdr_flags { + OFPIEH12_NONEXT = 1 << 0, /* "No next header" encountered. */ + OFPIEH12_ESP = 1 << 1, /* Encrypted Sec Payload header present. */ + OFPIEH12_AUTH = 1 << 2, /* Authentication header present. */ + OFPIEH12_DEST = 1 << 3, /* 1 or 2 dest headers present. */ + OFPIEH12_FRAG = 1 << 4, /* Fragment header present. */ + OFPIEH12_ROUTER = 1 << 5, /* Router header present. */ + OFPIEH12_HOP = 1 << 6, /* Hop-by-hop header present. */ + OFPIEH12_UNREP = 1 << 7, /* Unexpected repeats encountered. */ + OFPIEH12_UNSEQ = 1 << 8 /* Unexpected sequencing encountered. */ +}; + /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -121,6 +134,7 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ + u16 exthdrs; /* IPv6 extension header flags */ union { struct { struct in6_addr src; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index fd1f809e9bc1..8b4124820f7d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -346,7 +346,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. 
*/ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 30); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -369,7 +369,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ + + nla_total_size(28) /* OVS_KEY_ATTR_ND */ + + nla_total_size(2); /* OVS_KEY_ATTR_IPV6_EXTHDRS */ } static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { @@ -437,6 +438,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, .next = ovs_nsh_key_attr_lens, }, + [OVS_KEY_ATTR_IPV6_EXTHDRS] = { + .len = sizeof(struct ovs_key_ipv6_exthdrs) }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -1597,6 +1600,17 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, attrs &= ~(1 << OVS_KEY_ATTR_IPV6); } + if (attrs & (1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS)) { + const struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; + + ipv6_exthdrs_key = nla_data(a[OVS_KEY_ATTR_IPV6_EXTHDRS]); + + SW_FLOW_KEY_PUT(match, ipv6.exthdrs, + ipv6_exthdrs_key->hdrs, is_mask); + + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS); + } + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { const struct ovs_key_arp *arp_key; @@ -2099,6 +2113,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv4_key->ipv4_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; + struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); if (!nla) @@ -2113,6 +2128,13 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ipv6_key->ipv6_tclass = output->ip.tos; ipv6_key->ipv6_hlimit = output->ip.ttl; ipv6_key->ipv6_frag = output->ip.frag; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6_EXTHDRS, + sizeof(*ipv6_exthdrs_key)); + if (!nla) + goto nla_put_failure; + ipv6_exthdrs_key = nla_data(nla); + ipv6_exthdrs_key->hdrs = output->ipv6.exthdrs; } else if (swkey->eth.type == htons(ETH_P_NSH)) { if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) goto nla_put_failure; -- cgit v1.2.3 From ba7bb663f5547ef474c98df99a97bb4a13c5715f Mon Sep 17 00:00:00 2001 From: David Dunn Date: Wed, 23 Feb 2022 22:57:41 +0000 Subject: KVM: x86: Provide per VM capability for disabling PMU virtualization Add a new capability, KVM_CAP_PMU_CAPABILITY, that takes a bitmask of settings/features to allow userspace to configure PMU virtualization on a per-VM basis. For now, support a single flag, KVM_PMU_CAP_DISABLE, to allow disabling PMU virtualization for a VM even when KVM is configured with enable_pmu=true at the module level. To keep KVM simple, disallow changing a VM's PMU configuration after vCPUs have been created.
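From userspace, the capability is enabled on the VM fd before any vCPU exists; a minimal sketch (error handling mostly elided, vm_fd is assumed to be an open KVM VM file descriptor):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_PMU_CAPABILITY,
		.args[0] = KVM_PMU_CAP_DISABLE,
	};

	/* must run before the first KVM_CREATE_VCPU on this VM */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP(KVM_CAP_PMU_CAPABILITY)");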
Signed-off-by: David Dunn Message-Id: <20220223225743.2703915-2-daviddunn@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 22 ++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/x86.c | 18 ++++++++++++++++++ include/uapi/linux/kvm.h | 3 +++ tools/include/uapi/linux/kvm.h | 4 ++++ 7 files changed, 50 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c3e68c1531f0..f5d011351016 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7643,3 +7643,25 @@ The argument to KVM_ENABLE_CAP is also a bitmask, and must be a subset of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace the hypercalls whose corresponding bit is in the argument, and return ENOSYS for the others. + +8.35 KVM_CAP_PMU_CAPABILITY --------------------------- + +:Capability: KVM_CAP_PMU_CAPABILITY +:Architectures: x86 +:Type: vm +:Parameters: arg[0] is bitmask of PMU virtualization capabilities. +:Returns: 0 on success, -EINVAL when arg[0] contains invalid bits + +This capability alters PMU virtualization in KVM. + +Calling KVM_CHECK_EXTENSION for this capability returns a bitmask of +PMU virtualization capabilities that can be adjusted on a VM. + +The argument to KVM_ENABLE_CAP is also a bitmask and selects specific +PMU virtualization capabilities to be applied to the VM. This can +only be invoked on a VM prior to the creation of VCPUs. + +At this time, KVM_PMU_CAP_DISABLE is the only capability. Setting +this capability will disable PMU virtualization for that VM. Usermode +should adjust CPUID leaf 0xA to reflect that the PMU is disabled.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 463171e3b613..dafb5a6220cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1147,6 +1147,7 @@ struct kvm_arch { bool exception_payload_enabled; bool bus_lock_detection_enabled; + bool enable_pmu; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5aa45f13b16d..d4de52409335 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!enable_pmu) + if (!vcpu->kvm->arch.enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 03fab48b149c..4e5b1eeeb77c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -487,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry || !enable_pmu) + if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16d29d41908f..0b95c379e234 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -110,6 +110,8 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) +#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -4330,6 +4332,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + case KVM_CAP_PMU_CAPABILITY: + r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; + break; } default: break; @@ -6004,6 +6009,18 @@ split_irqchip_unlock: kvm->arch.exit_on_emulation_error = cap->args[0]; r = 0; break; + case KVM_CAP_PMU_CAPABILITY: + r = -EINVAL; + if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -11586,6 +11603,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.guest_can_read_msr_platform_info = true; + kvm->arch.enable_pmu = enable_pmu; #if IS_ENABLED(CONFIG_HYPERV) spin_lock_init(&kvm->arch.hv_root_tdp_lock); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a02bbf8fd0f6..d2f1efc3aa35 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1142,6 +1142,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SYS_ATTRIBUTES 209 #define KVM_CAP_PPC_AIL_MODE_3 210 #define KVM_CAP_S390_MEM_OP_EXTENSION 211 +#define KVM_CAP_PMU_CAPABILITY 212 #ifdef KVM_CAP_IRQ_ROUTING @@ -1978,6 +1979,8 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. 
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 507ee1f2aa96..bbc6b7c2dc1b 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1135,6 +1135,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 #define KVM_CAP_PPC_AIL_MODE_3 210 +#define KVM_CAP_S390_MEM_OP_EXTENSION 211 +#define KVM_CAP_PMU_CAPABILITY 212 #ifdef KVM_CAP_IRQ_ROUTING @@ -1971,6 +1973,8 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. -- cgit v1.2.3 From 43245eca6e670ebf65908b549641c1460a9cc944 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 2 Feb 2022 17:55:02 -0500 Subject: NFSv4.1 support for NFS4_RESULT_PRESERVE_UNLINKED In 4.1+, the server is allowed to set a flag NFS4_RESULT_PRESERVE_UNLINKED in reply to the OPEN, which tells the client that it does not need to do a silly rename of an opened file when it's being removed. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 ++++++++-- fs/nfs/nfs4proc.c | 2 ++ include/linux/nfs_fs.h | 1 + include/uapi/linux/nfs4.h | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fbb4a522d716..8b190c8e4a45 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1419,7 +1419,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if (flags & LOOKUP_REVAL) goto out_force; out: - return (inode->i_nlink == 0) ? -ESTALE : 0; + if (inode->i_nlink > 0 || + (inode->i_nlink == 0 && + test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) + return 0; + else + return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; @@ -2330,7 +2335,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); - if (d_count(dentry) > 1) { + if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, + &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b3793b82a5e7..73a9b6de666c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3050,6 +3050,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED) + set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 333ea05e2531..0e79dbbc759a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -277,6 +277,7 @@ struct nfs4_copy_state { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ +#define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FORCE_READDIR (7) /* force readdirplus */ #define NFS_INO_LAYOUTCOMMIT (9) /*
layoutcommit required */ diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 800bb0ffa6e6..1d2043708bf1 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -45,6 +45,7 @@ #define NFS4_OPEN_RESULT_CONFIRM 0x0002 #define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_PRESERVE_UNLINKED 0x0008 #define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK 0x0020 #define NFS4_SHARE_ACCESS_MASK 0x000F -- cgit v1.2.3 From 9e8d5470325f25bed7d33f9faaae6d5e4f313650 Mon Sep 17 00:00:00 2001 From: Hammer Hsieh Date: Tue, 22 Feb 2022 17:36:03 +0800 Subject: serial: sunplus-uart: Add Sunplus SoC UART Driver Add Sunplus SoC UART driver. The SP7021 UART block contains five UARTs: UART0 to UART4 are supported in SP7021, with the features listed below. Full-duplex communication. Configurable data packet length. Configurable stop bit number. Force break condition. Configurable baud rate. Error detection and reporting. Configurable RXD noise rejection vote. The UART0 pinout supports only the two TX/RX pins. The UART1 to UART4 pinouts support the four TX/RX/CTS/RTS pins. Normally UART0 is used for the kernel console, but it can also be used as a normal UART. Set "console=ttySUP0,115200" on the kernel command line to select it; SUP stands for Sunplus UART Port. On probe, the driver creates device nodes named "/dev/ttySUPx". https://sunplus.atlassian.net/wiki/spaces/doc/pages/1873412290/13.+Universal+Asynchronous+Receiver+Transmitter+UART Signed-off-by: Hammer Hsieh Link: https://lore.kernel.org/r/1645522563-17183-3-git-send-email-hammerh0314@gmail.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + drivers/tty/serial/Kconfig | 25 ++ drivers/tty/serial/Makefile | 1 + drivers/tty/serial/sunplus-uart.c | 770 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 5 files changed, 800 insertions(+) create mode 100644 drivers/tty/serial/sunplus-uart.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 5c39f47af44f..4a30001f6d7b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18553,6 +18553,7 @@ SUNPLUS UART DRIVER M: Hammer Hsieh S: Maintained F: Documentation/devicetree/bindings/serial/sunplus,sp7021-uart.yaml +F: drivers/tty/serial/sunplus-uart.c SUPERH M: Yoshinori Sato diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 66916397a19c..e952ec5c7a7c 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1564,6 +1564,31 @@ config SERIAL_LITEUART_CONSOLE and warnings and which allows logins in single user mode). Otherwise, say 'N'. +config SERIAL_SUNPLUS + tristate "Sunplus UART support" + depends on OF || COMPILE_TEST + select SERIAL_CORE + help + Select this option if you would like to use Sunplus serial port on + Sunplus SoC SP7021. + If you enable this option, Sunplus serial ports in the system will + be registered as ttySUPx. + This driver can also be built as a module. If so, the module will be + called sunplus-uart. + +config SERIAL_SUNPLUS_CONSOLE + bool "Console on Sunplus UART" + depends on SERIAL_SUNPLUS + select SERIAL_CORE_CONSOLE + select SERIAL_EARLYCON + help + Select this option if you would like to use a Sunplus UART as the + system console. + Even if you say Y here, the currently visible virtual console + (/dev/tty0) will still be used as the system console by default, but + you can alter that using a kernel command line option such as + "console=ttySUPx".
+ endmenu config SERIAL_MCTRL_GPIO diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 7da0856cd198..61cc8de95571 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_SERIAL_RDA) += rda-uart.o obj-$(CONFIG_SERIAL_MILBEAUT_USIO) += milbeaut_usio.o obj-$(CONFIG_SERIAL_SIFIVE) += sifive.o obj-$(CONFIG_SERIAL_LITEUART) += liteuart.o +obj-$(CONFIG_SERIAL_SUNPLUS) += sunplus-uart.o # GPIOLIB helpers for modem control lines obj-$(CONFIG_SERIAL_MCTRL_GPIO) += serial_mctrl_gpio.o diff --git a/drivers/tty/serial/sunplus-uart.c b/drivers/tty/serial/sunplus-uart.c new file mode 100644 index 000000000000..450c8e75abb4 --- /dev/null +++ b/drivers/tty/serial/sunplus-uart.c @@ -0,0 +1,770 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Sunplus SoC UART driver + * + * Author: Hammer Hsieh + * + * Note1: This driver is 8250-like uart, but are not register compatible. + * + * Note2: On some buses, for preventing data incoherence, must do a read + * for ensure write made it to hardware. In this driver, function startup + * and shutdown did not do a read but only do a write directly. For what? + * In Sunplus bus communication between memory bus and peripheral bus with + * posted write, it will send a specific command after last write command + * to make sure write done. Then memory bus identify the specific command + * and send done signal back to master device. After master device received + * done signal, then proceed next write command. It is no need to do a read + * before write. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Register offsets */ +#define SUP_UART_DATA 0x00 +#define SUP_UART_LSR 0x04 +#define SUP_UART_MSR 0x08 +#define SUP_UART_LCR 0x0C +#define SUP_UART_MCR 0x10 +#define SUP_UART_DIV_L 0x14 +#define SUP_UART_DIV_H 0x18 +#define SUP_UART_ISC 0x1C +#define SUP_UART_TX_RESIDUE 0x20 +#define SUP_UART_RX_RESIDUE 0x24 + +/* Line Status Register bits */ +#define SUP_UART_LSR_BC BIT(5) /* break condition status */ +#define SUP_UART_LSR_FE BIT(4) /* frame error status */ +#define SUP_UART_LSR_OE BIT(3) /* overrun error status */ +#define SUP_UART_LSR_PE BIT(2) /* parity error status */ +#define SUP_UART_LSR_RX BIT(1) /* 1: receive fifo not empty */ +#define SUP_UART_LSR_TX BIT(0) /* 1: transmit fifo is not full */ +#define SUP_UART_LSR_TX_NOT_FULL 1 +#define SUP_UART_LSR_BRK_ERROR_BITS GENMASK(5, 2) + +/* Line Control Register bits */ +#define SUP_UART_LCR_SBC BIT(5) /* select break condition */ + +/* Modem Control Register bits */ +#define SUP_UART_MCR_RI BIT(3) /* ring indicator */ +#define SUP_UART_MCR_DCD BIT(2) /* data carrier detect */ + +/* Interrupt Status/Control Register bits */ +#define SUP_UART_ISC_RXM BIT(5) /* RX interrupt enable */ +#define SUP_UART_ISC_TXM BIT(4) /* TX interrupt enable */ +#define SUP_UART_ISC_RX BIT(1) /* RX interrupt status */ +#define SUP_UART_ISC_TX BIT(0) /* TX interrupt status */ + +#define SUP_DUMMY_READ BIT(16) /* drop bytes received on a !CREAD port */ +#define SUP_UART_NR 5 + +struct sunplus_uart_port { + struct uart_port port; + struct clk *clk; + struct reset_control *rstc; +}; + +static void sp_uart_put_char(struct uart_port *port, unsigned int ch) +{ + writel(ch, port->membase + SUP_UART_DATA); +} + +static u32 sunplus_tx_buf_not_full(struct uart_port *port) +{ + unsigned int lsr = readl(port->membase + SUP_UART_LSR); + + return (lsr & SUP_UART_LSR_TX) ? 
SUP_UART_LSR_TX_NOT_FULL : 0; +} + +static unsigned int sunplus_tx_empty(struct uart_port *port) +{ + unsigned int lsr = readl(port->membase + SUP_UART_LSR); + + return (lsr & UART_LSR_TEMT) ? TIOCSER_TEMT : 0; +} + +static void sunplus_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + unsigned int mcr = readl(port->membase + SUP_UART_MCR); + + if (mctrl & TIOCM_DTR) + mcr |= UART_MCR_DTR; + else + mcr &= ~UART_MCR_DTR; + + if (mctrl & TIOCM_RTS) + mcr |= UART_MCR_RTS; + else + mcr &= ~UART_MCR_RTS; + + if (mctrl & TIOCM_CAR) + mcr |= SUP_UART_MCR_DCD; + else + mcr &= ~SUP_UART_MCR_DCD; + + if (mctrl & TIOCM_RI) + mcr |= SUP_UART_MCR_RI; + else + mcr &= ~SUP_UART_MCR_RI; + + if (mctrl & TIOCM_LOOP) + mcr |= UART_MCR_LOOP; + else + mcr &= ~UART_MCR_LOOP; + + writel(mcr, port->membase + SUP_UART_MCR); +} + +static unsigned int sunplus_get_mctrl(struct uart_port *port) +{ + unsigned int mcr, ret = 0; + + mcr = readl(port->membase + SUP_UART_MCR); + + if (mcr & UART_MCR_DTR) + ret |= TIOCM_DTR; + + if (mcr & UART_MCR_RTS) + ret |= TIOCM_RTS; + + if (mcr & SUP_UART_MCR_DCD) + ret |= TIOCM_CAR; + + if (mcr & SUP_UART_MCR_RI) + ret |= TIOCM_RI; + + if (mcr & UART_MCR_LOOP) + ret |= TIOCM_LOOP; + + return ret; +} + +static void sunplus_stop_tx(struct uart_port *port) +{ + unsigned int isc; + + isc = readl(port->membase + SUP_UART_ISC); + isc &= ~SUP_UART_ISC_TXM; + writel(isc, port->membase + SUP_UART_ISC); +} + +static void sunplus_start_tx(struct uart_port *port) +{ + unsigned int isc; + + isc = readl(port->membase + SUP_UART_ISC); + isc |= SUP_UART_ISC_TXM; + writel(isc, port->membase + SUP_UART_ISC); +} + +static void sunplus_stop_rx(struct uart_port *port) +{ + unsigned int isc; + + isc = readl(port->membase + SUP_UART_ISC); + isc &= ~SUP_UART_ISC_RXM; + writel(isc, port->membase + SUP_UART_ISC); +} + +static void sunplus_break_ctl(struct uart_port *port, int ctl) +{ + unsigned long flags; + unsigned int lcr; + + spin_lock_irqsave(&port->lock, flags); + + lcr = readl(port->membase + SUP_UART_LCR); + + if (ctl) + lcr |= SUP_UART_LCR_SBC; /* start break */ + else + lcr &= ~SUP_UART_LCR_SBC; /* stop break */ + + writel(lcr, port->membase + SUP_UART_LCR); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void transmit_chars(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + + if (port->x_char) { + sp_uart_put_char(port, port->x_char); + port->icount.tx++; + port->x_char = 0; + return; + } + + if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { + sunplus_stop_tx(port); + return; + } + + do { + sp_uart_put_char(port, xmit->buf[xmit->tail]); + xmit->tail = (xmit->tail + 1) % UART_XMIT_SIZE; + port->icount.tx++; + + if (uart_circ_empty(xmit)) + break; + } while (sunplus_tx_buf_not_full(port)); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + if (uart_circ_empty(xmit)) + sunplus_stop_tx(port); +} + +static void receive_chars(struct uart_port *port) +{ + unsigned int lsr = readl(port->membase + SUP_UART_LSR); + unsigned int ch, flag; + + do { + ch = readl(port->membase + SUP_UART_DATA); + flag = TTY_NORMAL; + port->icount.rx++; + + if (unlikely(lsr & SUP_UART_LSR_BRK_ERROR_BITS)) { + if (lsr & SUP_UART_LSR_BC) { + lsr &= ~(SUP_UART_LSR_FE | SUP_UART_LSR_PE); + port->icount.brk++; + flag = TTY_BREAK; + if (uart_handle_break(port)) + goto ignore_char; + } else if (lsr & SUP_UART_LSR_PE) { + port->icount.parity++; + flag = TTY_PARITY; + } else if (lsr & SUP_UART_LSR_FE) { + port->icount.frame++; + flag = TTY_FRAME; + } + + 
if (lsr & SUP_UART_LSR_OE) + port->icount.overrun++; + } + + if (port->ignore_status_mask & SUP_DUMMY_READ) + goto ignore_char; + + if (uart_handle_sysrq_char(port, ch)) + goto ignore_char; + + uart_insert_char(port, lsr, SUP_UART_LSR_OE, ch, flag); + +ignore_char: + lsr = readl(port->membase + SUP_UART_LSR); + } while (lsr & SUP_UART_LSR_RX); + + tty_flip_buffer_push(&port->state->port); +} + +static irqreturn_t sunplus_uart_irq(int irq, void *args) +{ + struct uart_port *port = args; + unsigned int isc; + + spin_lock(&port->lock); + + isc = readl(port->membase + SUP_UART_ISC); + + if (isc & SUP_UART_ISC_RX) + receive_chars(port); + + if (isc & SUP_UART_ISC_TX) + transmit_chars(port); + + spin_unlock(&port->lock); + + return IRQ_HANDLED; +} + +static int sunplus_startup(struct uart_port *port) +{ + unsigned long flags; + unsigned int isc = 0; + int ret; + + ret = request_irq(port->irq, sunplus_uart_irq, 0, "sunplus_uart", port); + if (ret) + return ret; + + spin_lock_irqsave(&port->lock, flags); + /* isc define Bit[7:4] int setting, Bit[3:0] int status + * isc register will clean Bit[3:0] int status after read + * only do a write to Bit[7:4] int setting + */ + isc |= SUP_UART_ISC_RXM; + writel(isc, port->membase + SUP_UART_ISC); + spin_unlock_irqrestore(&port->lock, flags); + + return 0; +} + +static void sunplus_shutdown(struct uart_port *port) +{ + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + /* isc define Bit[7:4] int setting, Bit[3:0] int status + * isc register will clean Bit[3:0] int status after read + * only do a write to Bit[7:4] int setting + */ + writel(0, port->membase + SUP_UART_ISC); /* disable all interrupt */ + spin_unlock_irqrestore(&port->lock, flags); + + free_irq(port->irq, port); +} + +static void sunplus_set_termios(struct uart_port *port, + struct ktermios *termios, + struct ktermios *oldtermios) +{ + u32 ext, div, div_l, div_h, baud, lcr; + u32 clk = port->uartclk; + unsigned long flags; + + baud = uart_get_baud_rate(port, termios, oldtermios, 0, port->uartclk / 16); + + /* baud rate = uartclk / ((16 * divisor + 1) + divisor_ext) */ + clk += baud >> 1; + div = clk / baud; + ext = div & 0x0F; + div = (div >> 4) - 1; + div_l = (div & 0xFF) | (ext << 12); + div_h = div >> 8; + + switch (termios->c_cflag & CSIZE) { + case CS5: + lcr = UART_LCR_WLEN5; + break; + case CS6: + lcr = UART_LCR_WLEN6; + break; + case CS7: + lcr = UART_LCR_WLEN7; + break; + default: + lcr = UART_LCR_WLEN8; + break; + } + + if (termios->c_cflag & CSTOPB) + lcr |= UART_LCR_STOP; + + if (termios->c_cflag & PARENB) { + lcr |= UART_LCR_PARITY; + + if (!(termios->c_cflag & PARODD)) + lcr |= UART_LCR_EPAR; + } + + spin_lock_irqsave(&port->lock, flags); + + uart_update_timeout(port, termios->c_cflag, baud); + + port->read_status_mask = 0; + if (termios->c_iflag & INPCK) + port->read_status_mask |= SUP_UART_LSR_PE | SUP_UART_LSR_FE; + + if (termios->c_iflag & (BRKINT | PARMRK)) + port->read_status_mask |= SUP_UART_LSR_BC; + + /* Characters to ignore */ + port->ignore_status_mask = 0; + if (termios->c_iflag & IGNPAR) + port->ignore_status_mask |= SUP_UART_LSR_FE | SUP_UART_LSR_PE; + + if (termios->c_iflag & IGNBRK) { + port->ignore_status_mask |= SUP_UART_LSR_BC; + + if (termios->c_iflag & IGNPAR) + port->ignore_status_mask |= SUP_UART_LSR_OE; + } + + /* Ignore all characters if CREAD is not set */ + if ((termios->c_cflag & CREAD) == 0) { + port->ignore_status_mask |= SUP_DUMMY_READ; + /* flush rx data FIFO */ + writel(0, port->membase + SUP_UART_RX_RESIDUE); + } + + /* Settings for 
baud rate divisor and lcr */ + writel(div_h, port->membase + SUP_UART_DIV_H); + writel(div_l, port->membase + SUP_UART_DIV_L); + writel(lcr, port->membase + SUP_UART_LCR); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void sunplus_set_ldisc(struct uart_port *port, struct ktermios *termios) +{ + int new = termios->c_line; + + if (new == N_PPS) + port->flags |= UPF_HARDPPS_CD; + else + port->flags &= ~UPF_HARDPPS_CD; +} + +static const char *sunplus_type(struct uart_port *port) +{ + return port->type == PORT_SUNPLUS ? "sunplus_uart" : NULL; +} + +static void sunplus_config_port(struct uart_port *port, int type) +{ + if (type & UART_CONFIG_TYPE) + port->type = PORT_SUNPLUS; +} + +static int sunplus_verify_port(struct uart_port *port, struct serial_struct *ser) +{ + if (ser->type != PORT_UNKNOWN && ser->type != PORT_SUNPLUS) + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_SERIAL_SUNPLUS_CONSOLE +static void wait_for_xmitr(struct uart_port *port) +{ + unsigned int val; + int ret; + + /* Wait while FIFO is full or timeout */ + ret = readl_poll_timeout_atomic(port->membase + SUP_UART_LSR, val, + (val & SUP_UART_LSR_TX), 1, 10000); + + if (ret == -ETIMEDOUT) { + dev_err(port->dev, "Timeout waiting while UART TX FULL\n"); + return; + } +} +#endif + +#ifdef CONFIG_CONSOLE_POLL +static void sunplus_poll_put_char(struct uart_port *port, unsigned char data) +{ + wait_for_xmitr(port); + sp_uart_put_char(port, data); +} + +static int sunplus_poll_get_char(struct uart_port *port) +{ + unsigned int lsr = readl(port->membase + SUP_UART_LSR); + + if (!(lsr & SUP_UART_LSR_RX)) + return NO_POLL_CHAR; + + return readl(port->membase + SUP_UART_DATA); +} +#endif + +static const struct uart_ops sunplus_uart_ops = { + .tx_empty = sunplus_tx_empty, + .set_mctrl = sunplus_set_mctrl, + .get_mctrl = sunplus_get_mctrl, + .stop_tx = sunplus_stop_tx, + .start_tx = sunplus_start_tx, + .stop_rx = sunplus_stop_rx, + .break_ctl = sunplus_break_ctl, + .startup = sunplus_startup, + .shutdown = sunplus_shutdown, + .set_termios = sunplus_set_termios, + .set_ldisc = sunplus_set_ldisc, + .type = sunplus_type, + .config_port = sunplus_config_port, + .verify_port = sunplus_verify_port, +#ifdef CONFIG_CONSOLE_POLL + .poll_put_char = sunplus_poll_put_char, + .poll_get_char = sunplus_poll_get_char, +#endif +}; + +#ifdef CONFIG_SERIAL_SUNPLUS_CONSOLE +struct sunplus_uart_port *sunplus_console_ports[SUP_UART_NR]; + +static void sunplus_uart_console_putchar(struct uart_port *port, int ch) +{ + wait_for_xmitr(port); + sp_uart_put_char(port, ch); +} + +static void sunplus_console_write(struct console *co, + const char *s, + unsigned int count) +{ + unsigned long flags; + int locked = 1; + + local_irq_save(flags); + + if (sunplus_console_ports[co->index]->port.sysrq) + locked = 0; + else if (oops_in_progress) + locked = spin_trylock(&sunplus_console_ports[co->index]->port.lock); + else + spin_lock(&sunplus_console_ports[co->index]->port.lock); + + uart_console_write(&sunplus_console_ports[co->index]->port, s, count, + sunplus_uart_console_putchar); + + if (locked) + spin_unlock(&sunplus_console_ports[co->index]->port.lock); + + local_irq_restore(flags); +} + +static int __init sunplus_console_setup(struct console *co, char *options) +{ + struct sunplus_uart_port *sup; + int baud = 115200; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + if (co->index < 0 || co->index >= SUP_UART_NR) + return -EINVAL; + + sup = sunplus_console_ports[co->index]; + if (!sup) + return -ENODEV; + + if (options) + 
uart_parse_options(options, &baud, &parity, &bits, &flow); + + return uart_set_options(&sup->port, co, baud, parity, bits, flow); +} + +static struct uart_driver sunplus_uart_driver; +static struct console sunplus_uart_console = { + .name = "ttySUP", + .write = sunplus_console_write, + .device = uart_console_device, + .setup = sunplus_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &sunplus_uart_driver +}; +#endif + +static struct uart_driver sunplus_uart_driver = { + .owner = THIS_MODULE, + .driver_name = "sunplus_uart", + .dev_name = "ttySUP", + .major = TTY_MAJOR, + .minor = 64, + .nr = SUP_UART_NR, + .cons = &sunplus_uart_console, +}; + +static void sunplus_uart_disable_unprepare(void *data) +{ + clk_disable_unprepare(data); +} + +static void sunplus_uart_reset_control_assert(void *data) +{ + reset_control_assert(data); +} + +static int sunplus_uart_probe(struct platform_device *pdev) +{ + struct sunplus_uart_port *sup; + struct uart_port *port; + struct resource *res; + int ret, irq; + + pdev->id = of_alias_get_id(pdev->dev.of_node, "serial"); + + if (pdev->id < 0 || pdev->id >= SUP_UART_NR) + return -EINVAL; + + sup = devm_kzalloc(&pdev->dev, sizeof(*sup), GFP_KERNEL); + if (!sup) + return -ENOMEM; + + sup->clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(sup->clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(sup->clk), "clk not found\n"); + + ret = clk_prepare_enable(sup->clk); + if (ret) + return ret; + + ret = devm_add_action_or_reset(&pdev->dev, sunplus_uart_disable_unprepare, sup->clk); + if (ret) + return ret; + + sup->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); + if (IS_ERR(sup->rstc)) + return dev_err_probe(&pdev->dev, PTR_ERR(sup->rstc), "rstc not found\n"); + + port = &sup->port; + + port->membase = devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(port->membase)) + return dev_err_probe(&pdev->dev, PTR_ERR(port->membase), "membase not found\n"); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + port->mapbase = res->start; + port->uartclk = clk_get_rate(sup->clk); + port->line = pdev->id; + port->irq = irq; + port->dev = &pdev->dev; + port->iotype = UPIO_MEM; + port->ops = &sunplus_uart_ops; + port->flags = UPF_BOOT_AUTOCONF; + port->fifosize = 128; + + ret = reset_control_deassert(sup->rstc); + if (ret) + return ret; + + ret = devm_add_action_or_reset(&pdev->dev, sunplus_uart_reset_control_assert, sup->rstc); + if (ret) + return ret; + +#ifdef CONFIG_SERIAL_SUNPLUS_CONSOLE + sunplus_console_ports[sup->port.line] = sup; +#endif + + platform_set_drvdata(pdev, &sup->port); + + ret = uart_add_one_port(&sunplus_uart_driver, &sup->port); +#ifdef CONFIG_SERIAL_SUNPLUS_CONSOLE + if (ret) + sunplus_console_ports[sup->port.line] = NULL; +#endif + + return ret; +} + +static int sunplus_uart_remove(struct platform_device *pdev) +{ + struct sunplus_uart_port *sup = platform_get_drvdata(pdev); + + uart_remove_one_port(&sunplus_uart_driver, &sup->port); + + return 0; +} + +static int __maybe_unused sunplus_uart_suspend(struct device *dev) +{ + struct sunplus_uart_port *sup = dev_get_drvdata(dev); + + if (!uart_console(&sup->port)) + uart_suspend_port(&sunplus_uart_driver, &sup->port); + + return 0; +} + +static int __maybe_unused sunplus_uart_resume(struct device *dev) +{ + struct sunplus_uart_port *sup = dev_get_drvdata(dev); + + if (!uart_console(&sup->port)) + uart_resume_port(&sunplus_uart_driver, &sup->port); + + return 0; +} + +static const struct dev_pm_ops sunplus_uart_pm_ops = { + 
SET_SYSTEM_SLEEP_PM_OPS(sunplus_uart_suspend, sunplus_uart_resume) +}; + +static const struct of_device_id sp_uart_of_match[] = { + { .compatible = "sunplus,sp7021-uart" }, + {} +}; +MODULE_DEVICE_TABLE(of, sp_uart_of_match); + +static struct platform_driver sunplus_uart_platform_driver = { + .probe = sunplus_uart_probe, + .remove = sunplus_uart_remove, + .driver = { + .name = "sunplus_uart", + .of_match_table = sp_uart_of_match, + .pm = &sunplus_uart_pm_ops, + } +}; + +static int __init sunplus_uart_init(void) +{ + int ret; + + ret = uart_register_driver(&sunplus_uart_driver); + if (ret) + return ret; + + ret = platform_driver_register(&sunplus_uart_platform_driver); + if (ret) + uart_unregister_driver(&sunplus_uart_driver); + + return ret; +} +module_init(sunplus_uart_init); + +static void __exit sunplus_uart_exit(void) +{ + platform_driver_unregister(&sunplus_uart_platform_driver); + uart_unregister_driver(&sunplus_uart_driver); +} +module_exit(sunplus_uart_exit); + +#ifdef CONFIG_SERIAL_EARLYCON +static void sunplus_uart_putc(struct uart_port *port, int c) +{ + unsigned int val; + int ret; + + ret = readl_poll_timeout_atomic(port->membase + SUP_UART_LSR, val, + (val & UART_LSR_TEMT), 1, 10000); + if (ret) + return; + + writel(c, port->membase + SUP_UART_DATA); +} + +static void sunplus_uart_early_write(struct console *con, const char *s, unsigned int n) +{ + struct earlycon_device *dev = con->data; + + uart_console_write(&dev->port, s, n, sunplus_uart_putc); +} + +static int __init +sunplus_uart_early_setup(struct earlycon_device *dev, const char *opt) +{ + if (!(dev->port.membase || dev->port.iobase)) + return -ENODEV; + + dev->con->write = sunplus_uart_early_write; + + return 0; +} +OF_EARLYCON_DECLARE(sunplus_uart, "sunplus,sp7021-uart", sunplus_uart_early_setup); +#endif + +MODULE_DESCRIPTION("Sunplus UART driver"); +MODULE_AUTHOR("Hammer Hsieh "); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 8885e69178bd..6faf502b7860 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -277,4 +277,7 @@ /* Freescale LINFlexD UART */ #define PORT_LINFLEXUART 122 +/* Sunplus UART */ +#define PORT_SUNPLUS 123 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From c2faf737abfb10f88f2d2612d573e9edc3c42c37 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Fri, 11 Feb 2022 15:10:36 +0100 Subject: tty: Reserve ldisc 29 for development purposes It's handy to have an ldisc number free for out-of-tree testing. This way, a new ldisc can be developed on any running system, without having to recompile the kernel just to define a new number. This is the highest number (and also the last one) available under the old numbering scheme, so let's reserve it before it's too late. From now on, every new ldisc upstreamed will have to increment NR_LDISCS in lockstep with its addition to the table in tty.h. Signed-off-by: Max Staudt Link: https://lore.kernel.org/r/20220211141036.6403-1-max@enpas.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index a58deb3061eb..9d0f06bfbac3 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -6,8 +6,6 @@ * 'tty.h' defines some structures used by tty_io.c and some defines. 
*/ -#define NR_LDISCS 30 - /* line disciplines */ #define N_TTY 0 #define N_SLIP 1 @@ -39,5 +37,9 @@ #define N_SPEAKUP 26 /* Speakup communication with synths */ #define N_NULL 27 /* Null ldisc used for error handling */ #define N_MCTP 28 /* MCTP-over-serial */ +#define N_DEVELOPMENT 29 /* Manual out-of-tree testing */ + +/* Always the newest line discipline + 1 */ +#define NR_LDISCS 30 #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From fad278388e01e3658a356118bed8ee2c2408d280 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 May 2021 21:15:15 -0700 Subject: media: omap3isp: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally writing across neighboring fields. Wrap the target region in struct_group(). This additionally fixes a theoretical misalignment of the copy (since the size of "buf" changes between 64-bit and 32-bit, but this is likely never built for 64-bit). FWIW, I think this code is totally broken on 64-bit (which appears to not be a "real" build configuration): it would either always fail (with an uninitialized data->buf_size) or would cause corruption in userspace due to the copy_to_user() in the call path against an uninitialized data->buf value: omap3isp_stat_request_statistics_time32(...) struct omap3isp_stat_data data64; ... omap3isp_stat_request_statistics(stat, &data64); int omap3isp_stat_request_statistics(struct ispstat *stat, struct omap3isp_stat_data *data) ... buf = isp_stat_buf_get(stat, data); static struct ispstat_buffer *isp_stat_buf_get(struct ispstat *stat, struct omap3isp_stat_data *data) ... if (buf->buf_size > data->buf_size) { ... return ERR_PTR(-EINVAL); } ... rval = copy_to_user(data->buf, buf->virt_addr, buf->buf_size); Regardless, additionally initialize data64 to be zero-filled to avoid undefined behavior. Cc: Laurent Pinchart Cc: Mauro Carvalho Chehab Cc: Arnd Bergmann Cc: Sakari Ailus Cc: linux-media@vger.kernel.org Fixes: 378e3f81cb56 ("media: omap3isp: support 64-bit version of omap3isp_stat_data") Cc: stable@vger.kernel.org Reviewed-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/lkml/20211215220505.GB21862@embeddedor Signed-off-by: Kees Cook --- drivers/media/platform/omap3isp/ispstat.c | 5 +++-- include/uapi/linux/omap3isp.h | 21 +++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/omap3isp/ispstat.c b/drivers/media/platform/omap3isp/ispstat.c index 5b9b57f4d9bf..68cf68dbcace 100644 --- a/drivers/media/platform/omap3isp/ispstat.c +++ b/drivers/media/platform/omap3isp/ispstat.c @@ -512,7 +512,7 @@ int omap3isp_stat_request_statistics(struct ispstat *stat, int omap3isp_stat_request_statistics_time32(struct ispstat *stat, struct omap3isp_stat_data_time32 *data) { - struct omap3isp_stat_data data64; + struct omap3isp_stat_data data64 = { }; int ret; ret = omap3isp_stat_request_statistics(stat, &data64); @@ -521,7 +521,8 @@ int omap3isp_stat_request_statistics_time32(struct ispstat *stat, data->ts.tv_sec = data64.ts.tv_sec; data->ts.tv_usec = data64.ts.tv_usec; - memcpy(&data->buf, &data64.buf, sizeof(*data) - sizeof(data->ts)); + data->buf = (uintptr_t)data64.buf; + memcpy(&data->frame, &data64.frame, sizeof(data->frame)); return 0; } diff --git a/include/uapi/linux/omap3isp.h b/include/uapi/linux/omap3isp.h index 87b55755f4ff..d9db7ad43890 100644 --- a/include/uapi/linux/omap3isp.h +++ b/include/uapi/linux/omap3isp.h @@ -162,6 +162,7 @@ struct omap3isp_h3a_aewb_config { * struct omap3isp_stat_data - Statistic data sent to or received from user * @ts: Timestamp of returned framestats. * @buf: Pointer to pass to user. + * @buf_size: Size of buffer. * @frame_number: Frame number of requested stats. * @cur_frame: Current frame number being processed. * @config_counter: Number of the configuration associated with the data. @@ -176,10 +177,12 @@ struct omap3isp_stat_data { struct timeval ts; #endif void __user *buf; - __u32 buf_size; - __u16 frame_number; - __u16 cur_frame; - __u16 config_counter; + __struct_group(/* no tag */, frame, /* no attrs */, + __u32 buf_size; + __u16 frame_number; + __u16 cur_frame; + __u16 config_counter; + ); }; #ifdef __KERNEL__ @@ -189,10 +192,12 @@ struct omap3isp_stat_data_time32 { __s32 tv_usec; } ts; __u32 buf; - __u32 buf_size; - __u16 frame_number; - __u16 cur_frame; - __u16 config_counter; + __struct_group(/* no tag */, frame, /* no attrs */, + __u32 buf_size; + __u16 frame_number; + __u16 cur_frame; + __u16 config_counter; + ); }; #endif -- cgit v1.2.3 From 89377bc1975c2993bde4a498a3a4e5817ac0ae2c Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 10 Feb 2022 11:07:55 +0530 Subject: nvme: add vectored-io support for user-passthrough Add a new NVME_IOCTL_IO64_CMD_VEC ioctl that works like the existing NVME_IOCTL_IO64_CMD ioctl except that it takes an array of iovecs and thus supports vectored I/O. - cmd.addr is the base address of the user iovec array - cmd.vec_cnt is the count of iovec array elements This patch does not include a vectored variant for admin commands, as most of them are light on buffers and likely to have low invocation frequency.
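As an illustration of the new interface (an editor's sketch, not part of the patch), userspace might submit a two-segment read through the vectored ioctl roughly as follows; the device path, nsid, read opcode encoding, and the 4 KiB LBA format are assumptions made for the example:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

/* Read two discontiguous 4 KiB buffers starting at LBA 0 with a single
 * passthru command. /dev/nvme0n1, nsid 1 and a 4 KiB-formatted namespace
 * are assumptions; error reporting is minimal.
 */
int nvme_vectored_read(void)
{
	char a[4096], b[4096];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	struct nvme_passthru_cmd64 cmd = { 0 };
	int fd = open("/dev/nvme0n1", O_RDONLY);

	if (fd < 0)
		return -1;

	cmd.opcode = 0x02;		/* NVMe I/O read */
	cmd.nsid = 1;
	cmd.addr = (uintptr_t)iov;	/* base address of the iovec array */
	cmd.vec_cnt = 2;		/* count of iovec array elements */
	cmd.cdw10 = 0;			/* starting LBA, lower 32 bits */
	cmd.cdw12 = 1;			/* number of LBAs, 0's based */

	if (ioctl(fd, NVME_IOCTL_IO64_CMD_VEC, &cmd)) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

Note that, per the uapi change below, vec_cnt occupies the union slot that data_len uses for the non-vectored ioctls; the total transfer length is derived from the iovecs themselves.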
Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 35 ++++++++++++++++++++++++++--------- include/uapi/linux/nvme_ioctl.h | 6 +++++- 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 22314962842d..aaf3dfad2657 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -56,7 +56,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout, bool vec) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -75,8 +75,22 @@ static int nvme_submit_user_cmd(struct request_queue *q, nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + if (!vec) + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); + else { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rq_data_dir(req), ubuffer, bufflen, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, + GFP_KERNEL); + kfree(iov); + } if (ret) goto out; bio = req->bio; @@ -170,7 +184,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return nvme_submit_user_cmd(ns->queue, &c, nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, + false); } static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, @@ -224,7 +239,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); + 0, &result, timeout, false); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -235,7 +250,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) + struct nvme_passthru_cmd64 __user *ucmd, bool vec) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -270,7 +285,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); + 0, &cmd.result, timeout, vec); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -296,7 +311,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -340,7 +355,9 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp); + return nvme_user_cmd64(ns->ctrl, ns, argp, false); + case NVME_IOCTL_IO64_CMD_VEC: + return nvme_user_cmd64(ns->ctrl, ns, argp, true); default: return -ENOTTY; } @@ -480,7 +497,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index d99b5a772698..b2e43185e3b5 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -55,7 +55,10 @@ struct nvme_passthru_cmd64 { __u64 metadata; __u64 addr; __u32 metadata_len; - __u32 data_len; + union { + __u32 data_len; /* for non-vectored io */ + __u32 vec_cnt; /* for vectored io */ + }; __u32 cdw10; __u32 cdw11; __u32 cdw12; @@ -78,5 +81,6 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_RESCAN _IO('N', 0x46) #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- cgit v1.2.3 From 0c9f178778919bb500443f340647b44227778fb2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 16 Feb 2022 10:52:42 +0800 Subject: iommu: Remove guest pasid related interfaces and definitions The guest pasid related uapi interfaces and definitions are not referenced anywhere in the tree. We've also reached a consensus to replace them with a new iommufd design. Remove them to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220216025249.3459465-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 210 --------------------------------------------- include/linux/iommu.h | 44 ---------- include/uapi/linux/iommu.h | 181 -------------------------------------- 3 files changed, 435 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 107dcf5938d6..3cbf4781e5bd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2031,216 +2031,6 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) return 0; } -/* - * Check flags and other user provided data for valid combinations. We also - * make sure no reserved fields or unused flags are set. This is to ensure - * not breaking userspace in the future when these fields or flags are used. 
- */ -static int iommu_check_cache_invl_data(struct iommu_cache_invalidate_info *info) -{ - u32 mask; - int i; - - if (info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) - return -EINVAL; - - mask = (1 << IOMMU_CACHE_INV_TYPE_NR) - 1; - if (info->cache & ~mask) - return -EINVAL; - - if (info->granularity >= IOMMU_INV_GRANU_NR) - return -EINVAL; - - switch (info->granularity) { - case IOMMU_INV_GRANU_ADDR: - if (info->cache & IOMMU_CACHE_INV_TYPE_PASID) - return -EINVAL; - - mask = IOMMU_INV_ADDR_FLAGS_PASID | - IOMMU_INV_ADDR_FLAGS_ARCHID | - IOMMU_INV_ADDR_FLAGS_LEAF; - - if (info->granu.addr_info.flags & ~mask) - return -EINVAL; - break; - case IOMMU_INV_GRANU_PASID: - mask = IOMMU_INV_PASID_FLAGS_PASID | - IOMMU_INV_PASID_FLAGS_ARCHID; - if (info->granu.pasid_info.flags & ~mask) - return -EINVAL; - - break; - case IOMMU_INV_GRANU_DOMAIN: - if (info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) - return -EINVAL; - break; - default: - return -EINVAL; - } - - /* Check reserved padding fields */ - for (i = 0; i < sizeof(info->padding); i++) { - if (info->padding[i]) - return -EINVAL; - } - - return 0; -} - -int iommu_uapi_cache_invalidate(struct iommu_domain *domain, struct device *dev, - void __user *uinfo) -{ - struct iommu_cache_invalidate_info inv_info = { 0 }; - u32 minsz; - int ret; - - if (unlikely(!domain->ops->cache_invalidate)) - return -ENODEV; - - /* - * No new spaces can be added before the variable sized union, the - * minimum size is the offset to the union. - */ - minsz = offsetof(struct iommu_cache_invalidate_info, granu); - - /* Copy minsz from user to get flags and argsz */ - if (copy_from_user(&inv_info, uinfo, minsz)) - return -EFAULT; - - /* Fields before the variable size union are mandatory */ - if (inv_info.argsz < minsz) - return -EINVAL; - - /* PASID and address granu require additional info beyond minsz */ - if (inv_info.granularity == IOMMU_INV_GRANU_PASID && - inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.pasid_info)) - return -EINVAL; - - if (inv_info.granularity == IOMMU_INV_GRANU_ADDR && - inv_info.argsz < offsetofend(struct iommu_cache_invalidate_info, granu.addr_info)) - return -EINVAL; - - /* - * User might be using a newer UAPI header which has a larger data - * size, we shall support the existing flags within the current - * size. Copy the remaining user data _after_ minsz but not more - * than the current kernel supported size. 
- */ - if (copy_from_user((void *)&inv_info + minsz, uinfo + minsz, - min_t(u32, inv_info.argsz, sizeof(inv_info)) - minsz)) - return -EFAULT; - - /* Now the argsz is validated, check the content */ - ret = iommu_check_cache_invl_data(&inv_info); - if (ret) - return ret; - - return domain->ops->cache_invalidate(domain, dev, &inv_info); -} -EXPORT_SYMBOL_GPL(iommu_uapi_cache_invalidate); - -static int iommu_check_bind_data(struct iommu_gpasid_bind_data *data) -{ - u64 mask; - int i; - - if (data->version != IOMMU_GPASID_BIND_VERSION_1) - return -EINVAL; - - /* Check the range of supported formats */ - if (data->format >= IOMMU_PASID_FORMAT_LAST) - return -EINVAL; - - /* Check all flags */ - mask = IOMMU_SVA_GPASID_VAL; - if (data->flags & ~mask) - return -EINVAL; - - /* Check reserved padding fields */ - for (i = 0; i < sizeof(data->padding); i++) { - if (data->padding[i]) - return -EINVAL; - } - - return 0; -} - -static int iommu_sva_prepare_bind_data(void __user *udata, - struct iommu_gpasid_bind_data *data) -{ - u32 minsz; - - /* - * No new spaces can be added before the variable sized union, the - * minimum size is the offset to the union. - */ - minsz = offsetof(struct iommu_gpasid_bind_data, vendor); - - /* Copy minsz from user to get flags and argsz */ - if (copy_from_user(data, udata, minsz)) - return -EFAULT; - - /* Fields before the variable size union are mandatory */ - if (data->argsz < minsz) - return -EINVAL; - /* - * User might be using a newer UAPI header, we shall let IOMMU vendor - * driver decide on what size it needs. Since the guest PASID bind data - * can be vendor specific, larger argsz could be the result of extension - * for one vendor but it should not affect another vendor. - * Copy the remaining user data _after_ minsz - */ - if (copy_from_user((void *)data + minsz, udata + minsz, - min_t(u32, data->argsz, sizeof(*data)) - minsz)) - return -EFAULT; - - return iommu_check_bind_data(data); -} - -int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, struct device *dev, - void __user *udata) -{ - struct iommu_gpasid_bind_data data = { 0 }; - int ret; - - if (unlikely(!domain->ops->sva_bind_gpasid)) - return -ENODEV; - - ret = iommu_sva_prepare_bind_data(udata, &data); - if (ret) - return ret; - - return domain->ops->sva_bind_gpasid(domain, dev, &data); -} -EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid); - -int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid) -{ - if (unlikely(!domain->ops->sva_unbind_gpasid)) - return -ENODEV; - - return domain->ops->sva_unbind_gpasid(dev, pasid); -} -EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid); - -int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, - void __user *udata) -{ - struct iommu_gpasid_bind_data data = { 0 }; - int ret; - - if (unlikely(!domain->ops->sva_bind_gpasid)) - return -ENODEV; - - ret = iommu_sva_prepare_bind_data(udata, &data); - if (ret) - return ret; - - return iommu_sva_unbind_gpasid(domain, dev, data.hpasid); -} -EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid); - static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index de0c57a567c8..cde1d0aad60f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -229,9 +229,6 @@ struct iommu_iotlb_gather { * @sva_unbind: Unbind process address space from device * @sva_get_pasid: Get PASID associated to a SVA handle * @page_response: handle page request response - * @cache_invalidate: invalidate 
translation caches - * @sva_bind_gpasid: bind guest pasid and mm - * @sva_unbind_gpasid: unbind guest pasid and mm * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain * - IOMMU_DOMAIN_DMA: must use a dma domain @@ -301,12 +298,6 @@ struct iommu_ops { int (*page_response)(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); - int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev, - struct iommu_cache_invalidate_info *inv_info); - int (*sva_bind_gpasid)(struct iommu_domain *domain, - struct device *dev, struct iommu_gpasid_bind_data *data); - - int (*sva_unbind_gpasid)(struct device *dev, u32 pasid); int (*def_domain_type)(struct device *dev); @@ -421,14 +412,6 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); -extern int iommu_uapi_cache_invalidate(struct iommu_domain *domain, - struct device *dev, - void __user *uinfo); - -extern int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata); -extern int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata); extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); @@ -1051,33 +1034,6 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) return IOMMU_PASID_INVALID; } -static inline int -iommu_uapi_cache_invalidate(struct iommu_domain *domain, - struct device *dev, - struct iommu_cache_invalidate_info *inv_info) -{ - return -ENODEV; -} - -static inline int iommu_uapi_sva_bind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata) -{ - return -ENODEV; -} - -static inline int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, void __user *udata) -{ - return -ENODEV; -} - -static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain, - struct device *dev, - ioasid_t pasid) -{ - return -ENODEV; -} - static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 59178fc229ca..65d8b0234f69 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -158,185 +158,4 @@ struct iommu_page_response { __u32 code; }; -/* defines the granularity of the invalidation */ -enum iommu_inv_granularity { - IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */ - IOMMU_INV_GRANU_PASID, /* PASID-selective invalidation */ - IOMMU_INV_GRANU_ADDR, /* page-selective invalidation */ - IOMMU_INV_GRANU_NR, /* number of invalidation granularities */ -}; - -/** - * struct iommu_inv_addr_info - Address Selective Invalidation Structure - * - * @flags: indicates the granularity of the address-selective invalidation - * - If the PASID bit is set, the @pasid field is populated and the invalidation - * relates to cache entries tagged with this PASID and matching the address - * range. - * - If ARCHID bit is set, @archid is populated and the invalidation relates - * to cache entries tagged with this architecture specific ID and matching - * the address range. - * - Both PASID and ARCHID can be set as they may tag different caches. - * - If neither PASID or ARCHID is set, global addr invalidation applies. 
- * - The LEAF flag indicates whether only the leaf PTE caching needs to be - * invalidated and other paging structure caches can be preserved. - * @pasid: process address space ID - * @archid: architecture-specific ID - * @addr: first stage/level input address - * @granule_size: page/block size of the mapping in bytes - * @nb_granules: number of contiguous granules to be invalidated - */ -struct iommu_inv_addr_info { -#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0) -#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1) -#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2) - __u32 flags; - __u32 archid; - __u64 pasid; - __u64 addr; - __u64 granule_size; - __u64 nb_granules; -}; - -/** - * struct iommu_inv_pasid_info - PASID Selective Invalidation Structure - * - * @flags: indicates the granularity of the PASID-selective invalidation - * - If the PASID bit is set, the @pasid field is populated and the invalidation - * relates to cache entries tagged with this PASID and matching the address - * range. - * - If the ARCHID bit is set, the @archid is populated and the invalidation - * relates to cache entries tagged with this architecture specific ID and - * matching the address range. - * - Both PASID and ARCHID can be set as they may tag different caches. - * - At least one of PASID or ARCHID must be set. - * @pasid: process address space ID - * @archid: architecture-specific ID - */ -struct iommu_inv_pasid_info { -#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0) -#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1) - __u32 flags; - __u32 archid; - __u64 pasid; -}; - -/** - * struct iommu_cache_invalidate_info - First level/stage invalidation - * information - * @argsz: User filled size of this data - * @version: API version of this structure - * @cache: bitfield that allows to select which caches to invalidate - * @granularity: defines the lowest granularity used for the invalidation: - * domain > PASID > addr - * @padding: reserved for future use (should be zero) - * @pasid_info: invalidation data when @granularity is %IOMMU_INV_GRANU_PASID - * @addr_info: invalidation data when @granularity is %IOMMU_INV_GRANU_ADDR - * - * Not all the combinations of cache/granularity are valid: - * - * +--------------+---------------+---------------+---------------+ - * | type / | DEV_IOTLB | IOTLB | PASID | - * | granularity | | | cache | - * +==============+===============+===============+===============+ - * | DOMAIN | N/A | Y | Y | - * +--------------+---------------+---------------+---------------+ - * | PASID | Y | Y | Y | - * +--------------+---------------+---------------+---------------+ - * | ADDR | Y | Y | N/A | - * +--------------+---------------+---------------+---------------+ - * - * Invalidations by %IOMMU_INV_GRANU_DOMAIN don't take any argument other than - * @version and @cache. - * - * If multiple cache types are invalidated simultaneously, they all - * must support the used granularity. 
- */ -struct iommu_cache_invalidate_info { - __u32 argsz; -#define IOMMU_CACHE_INVALIDATE_INFO_VERSION_1 1 - __u32 version; -/* IOMMU paging structure cache */ -#define IOMMU_CACHE_INV_TYPE_IOTLB (1 << 0) /* IOMMU IOTLB */ -#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB (1 << 1) /* Device IOTLB */ -#define IOMMU_CACHE_INV_TYPE_PASID (1 << 2) /* PASID cache */ -#define IOMMU_CACHE_INV_TYPE_NR (3) - __u8 cache; - __u8 granularity; - __u8 padding[6]; - union { - struct iommu_inv_pasid_info pasid_info; - struct iommu_inv_addr_info addr_info; - } granu; -}; - -/** - * struct iommu_gpasid_bind_data_vtd - Intel VT-d specific data on device and guest - * SVA binding. - * - * @flags: VT-d PASID table entry attributes - * @pat: Page attribute table data to compute effective memory type - * @emt: Extended memory type - * - * Only guest vIOMMU selectable and effective options are passed down to - * the host IOMMU. - */ -struct iommu_gpasid_bind_data_vtd { -#define IOMMU_SVA_VTD_GPASID_SRE (1 << 0) /* supervisor request */ -#define IOMMU_SVA_VTD_GPASID_EAFE (1 << 1) /* extended access enable */ -#define IOMMU_SVA_VTD_GPASID_PCD (1 << 2) /* page-level cache disable */ -#define IOMMU_SVA_VTD_GPASID_PWT (1 << 3) /* page-level write through */ -#define IOMMU_SVA_VTD_GPASID_EMTE (1 << 4) /* extended mem type enable */ -#define IOMMU_SVA_VTD_GPASID_CD (1 << 5) /* PASID-level cache disable */ -#define IOMMU_SVA_VTD_GPASID_WPE (1 << 6) /* Write protect enable */ -#define IOMMU_SVA_VTD_GPASID_LAST (1 << 7) - __u64 flags; - __u32 pat; - __u32 emt; -}; - -#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ - IOMMU_SVA_VTD_GPASID_EMTE | \ - IOMMU_SVA_VTD_GPASID_PCD | \ - IOMMU_SVA_VTD_GPASID_PWT) - -/** - * struct iommu_gpasid_bind_data - Information about device and guest PASID binding - * @argsz: User filled size of this data - * @version: Version of this data structure - * @format: PASID table entry format - * @flags: Additional information on guest bind request - * @gpgd: Guest page directory base of the guest mm to bind - * @hpasid: Process address space ID used for the guest mm in host IOMMU - * @gpasid: Process address space ID used for the guest mm in guest IOMMU - * @addr_width: Guest virtual address width - * @padding: Reserved for future use (should be zero) - * @vtd: Intel VT-d specific data - * - * Guest to host PASID mapping can be an identity or non-identity, where guest - * has its own PASID space. For non-identify mapping, guest to host PASID lookup - * is needed when VM programs guest PASID into an assigned device. VMM may - * trap such PASID programming then request host IOMMU driver to convert guest - * PASID to host PASID based on this bind data. - */ -struct iommu_gpasid_bind_data { - __u32 argsz; -#define IOMMU_GPASID_BIND_VERSION_1 1 - __u32 version; -#define IOMMU_PASID_FORMAT_INTEL_VTD 1 -#define IOMMU_PASID_FORMAT_LAST 2 - __u32 format; - __u32 addr_width; -#define IOMMU_SVA_GPASID_VAL (1 << 0) /* guest PASID valid */ - __u64 flags; - __u64 gpgd; - __u64 hpasid; - __u64 gpasid; - __u8 padding[8]; - /* Vendor specific data */ - union { - struct iommu_gpasid_bind_data_vtd vtd; - } vendor; -}; - #endif /* _UAPI_IOMMU_H */ -- cgit v1.2.3 From 7b8135f4df98b155b23754b6065c157861e268f1 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 1 Mar 2022 05:04:34 +0000 Subject: rtnetlink: add new rtm tunnel api for tunnel id filtering This patch adds new rtm tunnel msg and api for tunnel id filtering in dst_metadata devices. 
The first dst_metadata device to use the api is the vxlan driver with the AF_BRIDGE family. This and later changes add the ability in the vxlan driver to do tunnel id filtering (or vni filtering) on dst_metadata devices. This is similar to the vlan api in the vlan filtering bridge. This patch includes selinux nlmsg_route_perms support for the RTM_*TUNNEL api from Benjamin Poirier. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 26 ++++++++++++++++++++++++++ include/uapi/linux/rtnetlink.h | 9 +++++++++ security/selinux/nlmsgtab.c | 5 ++++- 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index be09d2ad4b5d..3dfc9ff2ec9b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -713,7 +713,32 @@ enum ipvlan_mode { #define IPVLAN_F_PRIVATE 0x01 #define IPVLAN_F_VEPA 0x02 +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 reserved1; + __u16 reserved2; + __u32 ifindex; +}; + /* VXLAN section */ +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + enum { IFLA_VXLAN_UNSPEC, IFLA_VXLAN_ID, @@ -745,6 +770,7 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 93d934cc4613..0970cb4b1b88 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -185,6 +185,13 @@ enum { RTM_GETNEXTHOPBUCKET, #define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + RTM_NEWTUNNEL = 120, +#define RTM_NEWTUNNEL RTM_NEWTUNNEL + RTM_DELTUNNEL, +#define RTM_DELTUNNEL RTM_DELTUNNEL + RTM_GETTUNNEL, +#define RTM_GETTUNNEL RTM_GETTUNNEL + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -756,6 +763,8 @@ enum rtnetlink_groups { #define RTNLGRP_BRVLAN RTNLGRP_BRVLAN RTNLGRP_MCTP_IFADDR, #define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR + RTNLGRP_TUNNEL, +#define RTNLGRP_TUNNEL RTNLGRP_TUNNEL __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 94ea2a8b2bb7..6ad3ee02e023 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -91,6 +91,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -176,7 +179,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro!
*/ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOPBUCKET + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWTUNNEL + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 445b2f36bb4efb81f064e931f28b9ec19f114355 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 1 Mar 2022 05:04:39 +0000 Subject: drivers: vxlan: vnifilter: add support for stats dumping Add support for VXLAN vni filter entries' stats dumping Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_vnifilter.c | 92 ++++++++++++++++++++++++++++++++++--- include/uapi/linux/if_link.h | 25 +++++++++- 2 files changed, 110 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 2d23312f4f62..9f28d0b6a6b2 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -116,6 +116,34 @@ void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan) spin_unlock(&vn->sock_lock); } +static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, + struct vxlan_vni_stats *dest) +{ + int i; + + memset(dest, 0, sizeof(*dest)); + for_each_possible_cpu(i) { + struct vxlan_vni_stats_pcpu *pstats; + struct vxlan_vni_stats temp; + unsigned int start; + + pstats = per_cpu_ptr(vninode->stats, i); + do { + start = u64_stats_fetch_begin_irq(&pstats->syncp); + memcpy(&temp, &pstats->stats, sizeof(temp)); + } while (u64_stats_fetch_retry_irq(&pstats->syncp, start)); + + dest->rx_packets += temp.rx_packets; + dest->rx_bytes += temp.rx_bytes; + dest->rx_drops += temp.rx_drops; + dest->rx_errors += temp.rx_errors; + dest->tx_packets += temp.tx_packets; + dest->tx_bytes += temp.tx_bytes; + dest->tx_drops += temp.tx_drops; + dest->tx_errors += temp.tx_errors; + } +} + static void vxlan_vnifilter_stats_add(struct vxlan_vni_node *vninode, int type, unsigned int len) { @@ -182,9 +210,48 @@ static size_t vxlan_vnifilter_entry_nlmsg_size(void) + nla_total_size(sizeof(struct in6_addr));/* VXLAN_VNIFILTER_ENTRY_GROUP{6} */ } +static int __vnifilter_entry_fill_stats(struct sk_buff *skb, + const struct vxlan_vni_node *vbegin) +{ + struct vxlan_vni_stats vstats; + struct nlattr *vstats_attr; + + vstats_attr = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY_STATS); + if (!vstats_attr) + goto out_stats_err; + + vxlan_vnifilter_stats_get(vbegin, &vstats); + if (nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_BYTES, + vstats.rx_bytes, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_PKTS, + vstats.rx_packets, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_DROPS, + vstats.rx_drops, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_ERRORS, + vstats.rx_errors, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_BYTES, + vstats.tx_bytes, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_PKTS, + vstats.tx_packets, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_DROPS, + vstats.tx_drops, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_ERRORS, + vstats.tx_errors, VNIFILTER_ENTRY_STATS_PAD)) + goto out_stats_err; + + nla_nest_end(skb, vstats_attr); + + return 0; + +out_stats_err: + nla_nest_cancel(skb, vstats_attr); + return -EMSGSIZE; +} + static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb, struct vxlan_vni_node *vbegin, - struct 
vxlan_vni_node *vend) + struct vxlan_vni_node *vend, + bool fill_stats) { struct nlattr *ventry; u32 vs = be32_to_cpu(vbegin->vni); @@ -217,6 +284,9 @@ static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb, } } + if (fill_stats && __vnifilter_entry_fill_stats(skb, vbegin)) + goto out_err; + nla_nest_end(skb, ventry); return true; @@ -249,7 +319,7 @@ static void vxlan_vnifilter_notify(const struct vxlan_dev *vxlan, tmsg->family = AF_BRIDGE; tmsg->ifindex = vxlan->dev->ifindex; - if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode)) + if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode, false)) goto out_err; nlmsg_end(skb, nlh); @@ -269,10 +339,11 @@ static int vxlan_vnifilter_dump_dev(const struct net_device *dev, { struct vxlan_vni_node *tmp, *v, *vbegin = NULL, *vend = NULL; struct vxlan_dev *vxlan = netdev_priv(dev); - struct tunnel_msg *new_tmsg; + struct tunnel_msg *new_tmsg, *tmsg; int idx = 0, s_idx = cb->args[1]; struct vxlan_vni_group *vg; struct nlmsghdr *nlh; + bool dump_stats; int err = 0; if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) @@ -283,6 +354,9 @@ static int vxlan_vnifilter_dump_dev(const struct net_device *dev, if (!vg || !vg->num_vnis) return 0; + tmsg = nlmsg_data(cb->nlh); + dump_stats = !!(tmsg->flags & TUNNEL_MSG_FLAG_STATS); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWTUNNEL, sizeof(*new_tmsg), NLM_F_MULTI); if (!nlh) @@ -302,11 +376,12 @@ static int vxlan_vnifilter_dump_dev(const struct net_device *dev, vend = v; continue; } - if (vnirange(vend, v) == 1 && + if (!dump_stats && vnirange(vend, v) == 1 && vxlan_addr_equal(&v->remote_ip, &vend->remote_ip)) { goto update_end; } else { - if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend)) { + if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, + dump_stats)) { err = -EMSGSIZE; break; } @@ -318,7 +393,7 @@ update_end: } if (!err && vbegin) { - if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend)) + if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats)) err = -EMSGSIZE; } @@ -338,6 +413,11 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb tmsg = nlmsg_data(cb->nlh); + if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { + NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header"); + return -EINVAL; + } + rcu_read_lock(); if (tmsg->ifindex) { dev = dev_get_by_index_rcu(net, tmsg->ifindex); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3dfc9ff2ec9b..e315e53125f4 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -716,18 +716,41 @@ enum ipvlan_mode { /* Tunnel RTM header */ struct tunnel_msg { __u8 family; - __u8 reserved1; + __u8 flags; __u16 reserved2; __u32 ifindex; }; /* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + enum { VXLAN_VNIFILTER_ENTRY_UNSPEC, VXLAN_VNIFILTER_ENTRY_START, VXLAN_VNIFILTER_ENTRY_END, VXLAN_VNIFILTER_ENTRY_GROUP, VXLAN_VNIFILTER_ENTRY_GROUP6, 
+ VXLAN_VNIFILTER_ENTRY_STATS, __VXLAN_VNIFILTER_ENTRY_MAX }; #define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) -- cgit v1.2.3 From cedd3614e5d9c80908099c19f8716714ce0610b1 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Feb 2022 11:06:54 +0530 Subject: perf: Add irq and exception return branch types This expands the generic branch type classification by adding two more entries, i.e. irq and exception return. It also updates the x86 implementation to process X86_BR_IRET and X86_BR_IRQ records as appropriate. This changes the branch types reported to user space on the x86 platform, but it should not be a problem. The possible scenarios and impacts are enumerated here. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1645681014-3346-1-git-send-email-anshuman.khandual@arm.com --- arch/x86/events/intel/lbr.c | 4 ++-- include/uapi/linux/perf_event.h | 2 ++ tools/include/uapi/linux/perf_event.h | 2 ++ tools/perf/util/branch.c | 4 +++- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 669c2be14784..fe1742c4ca49 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1329,10 +1329,10 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = { PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ PERF_BR_SYSRET, /* X86_BR_SYSRET */ PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_ERET, /* X86_BR_IRET */ PERF_BR_COND, /* X86_BR_JCC */ PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IRQ, /* X86_BR_IRQ */ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ PERF_BR_UNKNOWN, /* X86_BR_ABORT */ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1b65042ab1db..7dc71768749d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -251,6 +251,8 @@ enum { PERF_BR_SYSRET = 8, /* syscall return */ PERF_BR_COND_CALL = 9, /* conditional function call */ PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ PERF_BR_MAX, }; diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4cd39aaccbe7..d1324c427e8b 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -251,6 +251,8 @@ enum { PERF_BR_SYSRET = 8, /* syscall return */ PERF_BR_COND_CALL = 9, /* conditional function call */ PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ PERF_BR_MAX, }; diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c index 2285b1eb3128..a9a909db8cc7 100644 --- a/tools/perf/util/branch.c +++ b/tools/perf/util/branch.c @@ -49,7 +49,9 @@ const char *branch_type_name(int type) "SYSCALL", "SYSRET", "COND_CALL", - "COND_RET" + "COND_RET", + "ERET", + "IRQ" }; if (type >= 0 && type < PERF_BR_MAX) -- cgit v1.2.3 From 4f0bfdfd8323e5b461fc1042143a1097dba9fced Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 23 Feb 2022 19:34:31 -0800 Subject: ELF: Properly redefine PT_GNU_* in terms of PT_LOOS The PT_GNU_* program header types are actually offsets from PT_LOOS, so redefine them as such, reorder them, and add the missing PT_GNU_RELRO.
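The redefinition is value-preserving; a compile-time check along these lines (an editor's hypothetical sketch, not from the patch) would pass against either the old or the new definitions, since glibc's <elf.h> uses the same numbers:

#include <linux/elf.h>

/* PT_LOOS is 0x60000000, so each PT_GNU_* keeps its well-known value. */
_Static_assert(PT_GNU_EH_FRAME == 0x6474e550, "PT_GNU_EH_FRAME moved");
_Static_assert(PT_GNU_STACK    == 0x6474e551, "PT_GNU_STACK moved");
_Static_assert(PT_GNU_RELRO    == 0x6474e552, "PT_GNU_RELRO moved");
_Static_assert(PT_GNU_PROPERTY == 0x6474e553, "PT_GNU_PROPERTY moved");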
Cc: Eric Biederman Cc: Peter Collingbourne Cc: Catalin Marinas Cc: Dave Martin Signed-off-by: Kees Cook --- include/uapi/linux/elf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 61bf4774b8f2..6438d55529bf 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -35,10 +35,11 @@ typedef __s64 Elf64_Sxword; #define PT_HIOS 0x6fffffff /* OS-specific */ #define PT_LOPROC 0x70000000 #define PT_HIPROC 0x7fffffff -#define PT_GNU_EH_FRAME 0x6474e550 -#define PT_GNU_PROPERTY 0x6474e553 - +#define PT_GNU_EH_FRAME (PT_LOOS + 0x474e550) #define PT_GNU_STACK (PT_LOOS + 0x474e551) +#define PT_GNU_RELRO (PT_LOOS + 0x474e552) +#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553) + /* * Extended Numbering -- cgit v1.2.3 From dd0ca255f3d27a1bb43d8e9529fb3645f9a341a3 Mon Sep 17 00:00:00 2001 From: Daniel Braunwarth Date: Mon, 28 Feb 2022 14:30:28 +0100 Subject: if_ether.h: add PROFINET Ethertype Add the Ethertype for the PROFINET protocol. Signed-off-by: Daniel Braunwarth Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index c0c2f3ed5729..4f4ed35a16db 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -86,6 +86,7 @@ * over Ethernet */ #define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ +#define ETH_P_PROFINET 0x8892 /* PROFINET */ #define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ -- cgit v1.2.3 From cd73cda742fbe1f33ed7306c7a01aa64f4e6ebd5 Mon Sep 17 00:00:00 2001 From: Daniel Braunwarth Date: Mon, 28 Feb 2022 14:30:29 +0100 Subject: if_ether.h: add EtherCAT Ethertype Add the Ethertype for the EtherCAT protocol. Signed-off-by: Daniel Braunwarth Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 4f4ed35a16db..1d0bccc3fa54 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -89,6 +89,7 @@ #define ETH_P_PROFINET 0x8892 /* PROFINET */ #define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ +#define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ -- cgit v1.2.3 From d58b8a99cbb84c1eb3b3613d23c1a328695a9455 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 25 Feb 2022 12:33:51 -0500 Subject: drm/amdkfd: Add SMI add event helper To remove duplicate code, unify the event message format, and simplify adding new events in the following patches. Use KFD_SMI_EVENT_MSG_SIZE to define the msg size; the same size will be used in user space to alloc the msg receive buffer.
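As a rough illustration (not part of this patch), a user-space consumer would size its receive buffer with the same constant. The sketch below is hypothetical: it assumes smi_fd was already obtained through the AMDKFD SMI events ioctl, and that each read() returns one formatted event record:

#include <stdio.h>
#include <unistd.h>

#define KFD_SMI_EVENT_MSG_SIZE	96	/* must match kfd_ioctl.h */

static void drain_smi_events(int smi_fd)
{
	char msg[KFD_SMI_EVENT_MSG_SIZE + 1];
	ssize_t len;

	/* Each event arrives as a single "%x <payload>\n" record. */
	while ((len = read(smi_fd, msg, KFD_SMI_EVENT_MSG_SIZE)) > 0) {
		msg[len] = '\0';
		printf("SMI event: %s", msg);
	}
}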
Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 68 ++++++++++------------------- include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 23 insertions(+), 46 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 1321fe91a1cf..bba8d9692a4c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -176,22 +176,28 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, rcu_read_unlock(); } -void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) +static void kfd_smi_event_add(struct kfd_dev *dev, unsigned int event, + char *fmt, ...) { - /* - * GpuReset msg = Reset seq number (incremented for - * every reset message sent before GPU reset). - * 1 byte event + 1 byte space + 8 bytes seq num + - * 1 byte \n + 1 byte \0 = 12 - */ - char fifo_in[12]; + char fifo_in[KFD_SMI_EVENT_MSG_SIZE]; int len; - unsigned int event; + va_list args; if (list_empty(&dev->smi_clients)) return; - memset(fifo_in, 0x0, sizeof(fifo_in)); + len = snprintf(fifo_in, sizeof(fifo_in), "%x ", event); + + va_start(args, fmt); + len += vsnprintf(fifo_in + len, sizeof(fifo_in) - len, fmt, args); + va_end(args); + + add_event_to_kfifo(dev, event, fifo_in, len); +} + +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) +{ + unsigned int event; if (post_reset) { event = KFD_SMI_EVENT_GPU_POST_RESET; @@ -199,48 +205,20 @@ void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - - len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event, - dev->reset_seq_num); - - add_event_to_kfifo(dev, event, fifo_in, len); + kfd_smi_event_add(dev, event, "%x\n", dev->reset_seq_num); } void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint64_t throttle_bitmask) { - /* - * ThermalThrottle msg = throttle_bitmask(8): - * thermal_interrupt_count(16): - * 1 byte event + 1 byte space + 16 byte throttle_bitmask + - * 1 byte : + 16 byte thermal_interupt_counter + 1 byte \n + - * 1 byte \0 = 37 - */ - char fifo_in[37]; - int len; - - if (list_empty(&dev->smi_clients)) - return; - - len = snprintf(fifo_in, sizeof(fifo_in), "%x %llx:%llx\n", - KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); - - add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len); + kfd_smi_event_add(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", + throttle_bitmask, + amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); } void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) { struct amdgpu_task_info task_info; - /* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */ - /* 1 byte event + 1 byte space + 25 bytes msg + 1 byte \n + - * 1 byte \0 = 29 - */ - char fifo_in[29]; - int len; - - if (list_empty(&dev->smi_clients)) - return; memset(&task_info, 0, sizeof(struct amdgpu_task_info)); amdgpu_vm_get_task_info(dev->adev, pasid, &task_info); @@ -248,10 +226,8 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) if (!task_info.pid) return; - len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, - task_info.pid, task_info.task_name); - - add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); + kfd_smi_event_add(dev, KFD_SMI_EVENT_VMFAULT, 
"%x:%s\n", + task_info.pid, task_info.task_name); } int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index baec5a41de3e..b40687bf1014 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -463,6 +463,7 @@ enum kfd_smi_event { }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) +#define KFD_SMI_EVENT_MSG_SIZE 96 struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */ -- cgit v1.2.3 From 46efc97b73060823fdc18103a5e317a8327d44e1 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:17 +0200 Subject: net: rtnetlink: RTM_GETSTATS: Allow filtering inside nests The filter_mask field of RTM_GETSTATS header determines which top-level attributes should be included in the netlink response. This saves processing time by only including the bits that the user cares about instead of always dumping everything. This is doubly important for HW-backed statistics that would typically require a trip to the device to fetch the stats. So far there was only one HW-backed stat suite per attribute. However, IFLA_STATS_LINK_OFFLOAD_XSTATS is a nest, and will gain a new stat suite in the following patches. It would therefore be advantageous to be able to filter within that nest, and select just one or the other HW-backed statistics suite. Extend rtnetlink so that RTM_GETSTATS permits attributes in the payload. The scheme is as follows: - RTM_GETSTATS - struct if_stats_msg - attr nest IFLA_STATS_GET_FILTERS - attr IFLA_STATS_LINK_OFFLOAD_XSTATS - u32 filter_mask This scheme reuses the existing enumerators by nesting them in a dedicated context attribute. This is covered by policies as usual, therefore a gradual opt-in is possible. Currently only IFLA_STATS_LINK_OFFLOAD_XSTATS nest has filtering enabled, because for the SW counters the issue does not seem to be that important. rtnl_offload_xstats_get_size() and _fill() are extended to observe the requested filters. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 10 +++ net/core/rtnetlink.c | 141 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 128 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e315e53125f4..4d62ea6e1288 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1207,6 +1207,16 @@ enum { #define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) +enum { + IFLA_STATS_GETSET_UNSPEC, + IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with + * a filter mask for the corresponding group. 
+ */ + __IFLA_STATS_GETSET_MAX, +}; + +#define IFLA_STATS_GETSET_MAX (__IFLA_STATS_GETSET_MAX - 1) + /* These are embedded into IFLA_STATS_LINK_XSTATS: * [IFLA_STATS_LINK_XSTATS] * -> [LINK_XSTATS_TYPE_xxx] diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ad858799fd93..31aa26062070 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5092,13 +5092,15 @@ rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, } static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, - int *prividx) + int *prividx, u32 off_filter_mask) { int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; bool have_data = false; int err; - if (*prividx <= attr_id_cpu_hit) { + if (*prividx <= attr_id_cpu_hit && + (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); if (!err) { have_data = true; @@ -5115,14 +5117,18 @@ static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, return 0; } -static int rtnl_offload_xstats_get_size(const struct net_device *dev) +static int rtnl_offload_xstats_get_size(const struct net_device *dev, + u32 off_filter_mask) { int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; int size; - size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); - nla_size += nla_total_size_64bit(size); + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { + size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); + nla_size += nla_total_size_64bit(size); + } if (nla_size != 0) nla_size += nla_total_size(0); @@ -5130,11 +5136,20 @@ static int rtnl_offload_xstats_get_size(const struct net_device *dev) return nla_size; } +struct rtnl_stats_dump_filters { + /* mask[0] filters outer attributes. Then individual nests have their + * filtering mask at the index of the nested attribute. 
+ */ + u32 mask[IFLA_STATS_MAX + 1]; +}; + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, unsigned int filter_mask, + unsigned int flags, + const struct rtnl_stats_dump_filters *filters, int *idxattr, int *prividx) { + unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; @@ -5210,13 +5225,17 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) goto nla_put_failure; - err = rtnl_offload_xstats_fill(skb, dev, prividx); + err = rtnl_offload_xstats_fill(skb, dev, prividx, + off_filter_mask); if (err == -ENODATA) nla_nest_cancel(skb, attr); else @@ -5281,9 +5300,10 @@ nla_put_failure: } static size_t if_nlmsg_stats_size(const struct net_device *dev, - u32 filter_mask) + const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); + unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); @@ -5319,8 +5339,12 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } - if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) - size += rtnl_offload_xstats_get_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + size += rtnl_offload_xstats_get_size(dev, off_filter_mask); + } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; @@ -5344,6 +5368,74 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, return size; } +#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) + +static const struct nla_policy +rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_OFFLOAD_XSTATS] = + NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), +}; + +static const struct nla_policy +rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_GET_FILTERS] = + NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), +}; + +static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_MAX + 1]; + int err; + int at; + + err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, + rtnl_stats_get_policy_filters, extack); + if (err < 0) + return err; + + for (at = 1; at <= IFLA_STATS_MAX; at++) { + if (tb[at]) { + if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { + NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); + return -EINVAL; + } + filters->mask[at] = nla_get_u32(tb[at]); + } + } + + return 0; +} + +static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, + u32 filter_mask, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + int err; + int i; + + filters->mask[0] = filter_mask; + for (i = 1; i < ARRAY_SIZE(filters->mask); i++) + filters->mask[i] = -1U; + + err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, + IFLA_STATS_GETSET_MAX, 
rtnl_stats_get_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_GET_FILTERS]) { + err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], + filters, extack); + if (err) + return err; + } + + return 0; +} + static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { @@ -5366,10 +5458,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } - if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { - NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); - return -EINVAL; - } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; @@ -5381,12 +5469,12 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; - u32 filter_mask; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), @@ -5403,19 +5491,22 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, if (!dev) return -ENODEV; - filter_mask = ifsm->filter_mask; - if (!filter_mask) { + if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; } - nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); + err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); + if (err) + return err; + + nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - 0, filter_mask, &idxattr, &prividx); + 0, &filters, &idxattr, &prividx); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); @@ -5431,12 +5522,12 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; + struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct hlist_head *head; struct net_device *dev; - u32 filter_mask = 0; int idx = 0; s_h = cb->args[0]; @@ -5451,12 +5542,16 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) return err; ifsm = nlmsg_data(cb->nlh); - filter_mask = ifsm->filter_mask; - if (!filter_mask) { + if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } + err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, + extack); + if (err) + return err; + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; @@ -5466,7 +5561,7 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - flags, filter_mask, + flags, &filters, &s_idxattr, &s_prividx); /* If we ran out of room on the first message, * we're in trouble -- cgit v1.2.3 From 9309f97aef6d8250bb484dabeac925c3a7c57716 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 
18:31:20 +0200 Subject: net: dev: Add hardware stats support Offloading switch device drivers may be able to collect statistics of the traffic taking place in the HW datapath that pertains to a certain soft netdevice, such as VLAN. Add the necessary infrastructure to allow exposing these statistics to the offloaded netdevice in question. The API was shaped by the following considerations: - Collection of HW statistics is not free: there may be a finite number of counters, and the act of counting may have a performance impact. It is therefore necessary to allow toggling whether HW counting should be done for any particular SW netdevice. - As the drivers are loaded and removed, a particular device may get offloaded and unoffloaded again. At the same time, the statistics values need to stay monotonic (modulo the eventual 64-bit wraparound), increasing only to reflect traffic measured in the device. To that end, the netdevice keeps around a lazily-allocated copy of struct rtnl_link_stats64. Device drivers then contribute to the values kept therein at various points. Even as the driver goes away, the struct stays around to maintain the statistics values. - Different HW devices may be able to count different things. The motivation behind this patch in particular is exposure of HW counters on Nvidia Spectrum switches, where the only practical approach to counting traffic on offloaded soft netdevices currently is to use router interface counters, and count L3 traffic. Correspondingly that is the statistics suite added in this patch. Other devices may be able to measure different kinds of traffic, and for that reason, the APIs are built to allow uniform access to different statistics suites. - Because soft netdevices and offloading drivers are only loosely bound, a netdevice uses a notifier chain to communicate with the drivers. Several new notifiers, NETDEV_OFFLOAD_XSTATS_*, have been added to carry messages to the offloading drivers. - Devices can have various conditions for when a particular counter is available. As the device is configured and reconfigured, the device offload may become or cease being suitable for counter binding. A netdevice can use a notifier type NETDEV_OFFLOAD_XSTATS_REPORT_USED to ping offloading drivers and determine whether anyone currently implements a given statistics suite. This information can then be propagated to user space. When the driver decides to unoffload a netdevice, it can use a newly-added function, netdev_offload_xstats_report_delta(), to record outstanding collected statistics, before destroying the HW counter. This patch adds a helper, call_netdevice_notifiers_info_robust(), for dispatching a notifier with the possibility of unwind when one of the consumers bails. Given the wish to eventually get rid of the global notifier block altogether, this helper only invokes the per-netns notifier block. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 42 +++++++ include/uapi/linux/if_link.h | 15 +++ net/core/dev.c | 267 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 323 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c79ee2296296..19a27ac361ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * @watchdog_dev_tracker: refcount tracker used by watchdog. 
* @dev_registered_tracker: tracker for reference held while * registered + * @offload_xstats_l3: L3 HW stats for this netdevice. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2287,6 +2288,7 @@ struct net_device { netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; + struct rtnl_hw_stats64 *offload_xstats_l3; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2727,6 +2729,10 @@ enum netdev_cmd { NETDEV_CVLAN_FILTER_DROP_INFO, NETDEV_SVLAN_FILTER_PUSH_INFO, NETDEV_SVLAN_FILTER_DROP_INFO, + NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + NETDEV_OFFLOAD_XSTATS_REPORT_USED, + NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); @@ -2777,6 +2783,42 @@ struct netdev_notifier_pre_changeaddr_info { const unsigned char *dev_addr; }; +enum netdev_offload_xstats_type { + NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, +}; + +struct netdev_notifier_offload_xstats_info { + struct netdev_notifier_info info; /* must be first */ + enum netdev_offload_xstats_type type; + + union { + /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */ + struct netdev_notifier_offload_xstats_rd *report_delta; + /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */ + struct netdev_notifier_offload_xstats_ru *report_used; + }; +}; + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack); +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type); +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type); +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *stats, bool *used, + struct netlink_ext_ack *extack); +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd, + const struct rtnl_hw_stats64 *stats); +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru); +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *stats); + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4d62ea6e1288..ef6a62a2e15d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -245,6 +245,21 @@ struct rtnl_link_stats64 { __u64 rx_nohandler; }; +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64. 
+ */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; +}; + /* The struct should be in sync with struct ifmap */ struct rtnl_link_ifmap { __u64 mem_start; diff --git a/net/core/dev.c b/net/core/dev.c index 2d6771075720..c9e54e5ad48d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1622,7 +1622,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - N(PRE_CHANGEADDR) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1939,6 +1940,32 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) @@ -7728,6 +7755,242 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct 
net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. 
+ */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; + + ASSERT_RTNL(); + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return; + + netdev_hw_stats64_add(stats, p_stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device @@ -10417,6 +10680,8 @@ void unregister_netdevice_many(struct list_head *head) dev_xdp_uninstall(dev); + netdev_offload_xstats_disable_all(dev); + /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ -- cgit v1.2.3 From 0e7788fd76222dba3229eada9162efab185923fc Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:21 +0200 Subject: net: rtnetlink: Add UAPI for obtaining L3 offload xstats Add a new IFLA_STATS_LINK_OFFLOAD_XSTATS child attribute, IFLA_OFFLOAD_XSTATS_L3_STATS, to carry statistics for traffic that takes place in a HW router. The offloaded HW stats are designed to allow per-netdevice enablement and disablement. Additionally, as a netdevice is configured, it may become or cease being suitable for binding of a HW counter. Both of these aspects need to be communicated to the userspace. To that end, add another child attribute, IFLA_OFFLOAD_XSTATS_HW_S_INFO: - attr nest IFLA_OFFLOAD_XSTATS_HW_S_INFO - attr nest IFLA_OFFLOAD_XSTATS_L3_STATS - attr IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST - {0,1} as u8 - attr IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED - {0,1} as u8 Thus this one attribute is a nest that can be used to carry information about various types of HW statistics, and indexing is very simply done by wrapping the information for a given statistics suite into the attribute that carries the suite is the RTM_GETSTATS query. At the same time, because _HW_S_INFO is nested directly below IFLA_STATS_LINK_OFFLOAD_XSTATS, it is possible through filtering to request only the metadata about individual statistics suites, without having to hit the HW to get the actual counters. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/uapi/linux/if_link.h | 11 +++ net/core/rtnetlink.c | 170 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ef6a62a2e15d..b1031f481d2f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1249,10 +1249,21 @@ enum { enum { IFLA_OFFLOAD_XSTATS_UNSPEC, IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO, /* HW stats info. A nest */ + IFLA_OFFLOAD_XSTATS_L3_STATS, /* struct rtnl_hw_stats64 */ __IFLA_OFFLOAD_XSTATS_MAX }; #define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) +enum { + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC, + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, /* u8 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, /* u8 */ + __IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX, +}; +#define IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX \ + (__IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX - 1) + /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4db1d6c01a7d..9ce894a9454c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5091,10 +5091,110 @@ rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, return 0; } +static unsigned int +rtnl_offload_xstats_get_size_stats(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return enabled ? sizeof(struct rtnl_hw_stats64) : 0; +} + +struct rtnl_offload_xstats_request_used { + bool request; + bool used; +}; + +static int +rtnl_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_offload_xstats_request_used *ru, + struct rtnl_hw_stats64 *stats, + struct netlink_ext_ack *extack) +{ + bool request; + bool used; + int err; + + request = netdev_offload_xstats_enabled(dev, type); + if (!request) { + used = false; + goto out; + } + + err = netdev_offload_xstats_get(dev, type, stats, &used, extack); + if (err) + return err; + +out: + if (ru) { + ru->request = request; + ru->used = used; + } + return 0; +} + +static int +rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, + struct rtnl_offload_xstats_request_used *ru) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr_id); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int +rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_offload_xstats_request_used ru_l3; + struct nlattr *nest; + int err; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); + if (err) + return err; + + nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); + if (!nest) + return -EMSGSIZE; + + if (rtnl_offload_xstats_fill_hw_s_info_one(skb, + IFLA_OFFLOAD_XSTATS_L3_STATS, + &ru_l3)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, int *prividx, u32 
off_filter_mask, struct netlink_ext_ack *extack) { + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; + int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; bool have_data = false; int err; @@ -5111,6 +5211,40 @@ static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, } } + if (*prividx <= attr_id_hw_s_info && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { + *prividx = attr_id_hw_s_info; + + err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); + if (err) + return err; + + have_data = true; + *prividx = 0; + } + + if (*prividx <= attr_id_l3_stats && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { + unsigned int size_l3; + struct nlattr *attr; + + *prividx = attr_id_l3_stats; + + size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); + attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, + nla_data(attr), extack); + if (err) + return err; + + have_data = true; + *prividx = 0; + } + if (!have_data) return -ENODATA; @@ -5118,9 +5252,35 @@ static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, return 0; } +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ + nla_total_size(sizeof(u8)) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ + (enabled ? nla_total_size(sizeof(u8)) : 0) + + 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ + rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + + 0; +} + static int rtnl_offload_xstats_get_size(const struct net_device *dev, u32 off_filter_mask) { + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; int size; @@ -5131,6 +5291,16 @@ static int rtnl_offload_xstats_get_size(const struct net_device *dev, nla_size += nla_total_size_64bit(size); } + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) + nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { + size = rtnl_offload_xstats_get_size_stats(dev, t_l3); + nla_size += nla_total_size_64bit(size); + } + if (nla_size != 0) nla_size += nla_total_size(0); -- cgit v1.2.3 From 03ba35667091337d8e632cf4b814f1c1b914609b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:22 +0200 Subject: net: rtnetlink: Add RTM_SETSTATS The offloaded HW stats are designed to allow per-netdevice enablement and disablement. These stats are only accessible through RTM_GETSTATS, and therefore should be toggled by a RTM_SETSTATS message. Add it, and the necessary skeleton handler. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 66 ++++++++++++++++++++++++++++++++++++++++++ security/selinux/nlmsgtab.c | 1 + 3 files changed, 69 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0970cb4b1b88..14462dc159fd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -146,6 +146,8 @@ enum { #define RTM_NEWSTATS RTM_NEWSTATS RTM_GETSTATS = 94, #define RTM_GETSTATS RTM_GETSTATS + RTM_SETSTATS, +#define RTM_SETSTATS RTM_SETSTATS RTM_NEWCACHEREPORT = 96, #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9ce894a9454c..d09354514355 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5564,6 +5564,10 @@ rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), }; +static const struct nla_policy +ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { +}; + static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) @@ -5769,6 +5773,67 @@ out: return skb->len; } +static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + int idxattr = 0, prividx = 0; + struct if_stats_msg *ifsm; + struct sk_buff *nskb; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); + return -EINVAL; + } + + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); + return -EINVAL; + } + + err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, + ifla_stats_set_policy, extack); + if (err < 0) + return err; + + nskb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!nskb) + return -ENOBUFS; + + err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, + NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + 0, &response_filters, &idxattr, &prividx, + extack); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else { + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + } + + return err; +} + /* Process one rtnetlink message. 
*/ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5994,4 +6059,5 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, 0); + rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); } diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 6ad3ee02e023..d8ceee9e0d6f 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -76,6 +76,7 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_GETNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_SETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWCACHEREPORT, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, -- cgit v1.2.3 From 5fd0b838efac16046509f7fb100455d0463b9687 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:23 +0200 Subject: net: rtnetlink: Add UAPI toggle for IFLA_OFFLOAD_XSTATS_L3_STATS The offloaded HW stats are designed to allow per-netdevice enablement and disablement. Add an attribute, IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, which should be carried by the RTM_SETSTATS message, and expresses a desire to toggle L3 offload xstats on or off. As part of the above, add an exported function rtnl_offload_xstats_notify() that drivers can use when they have installed or deinstalled the counters backing the HW stats. At this point, it is possible to enable, disable and query L3 offload xstats on netdevices. (However there is no driver actually implementing these.) Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 ++ include/uapi/linux/if_link.h | 1 + include/uapi/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 75 ++++++++++++++++++++++++++++++++---------- 4 files changed, 64 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bb9cb84114c1..7f970b16da3a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -134,4 +134,7 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); + +extern void rtnl_offload_xstats_notify(struct net_device *dev); + #endif /* __LINUX_RTNETLINK_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b1031f481d2f..ddca20357e7e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1227,6 +1227,7 @@ enum { IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with * a filter mask for the corresponding group. 
*/ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ __IFLA_STATS_GETSET_MAX, }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 14462dc159fd..51530aade46e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -767,6 +767,8 @@ enum rtnetlink_groups { #define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR RTNLGRP_TUNNEL, #define RTNLGRP_TUNNEL RTNLGRP_TUNNEL + RTNLGRP_STATS, +#define RTNLGRP_STATS RTNLGRP_STATS __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d09354514355..a66b6761b88b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5566,6 +5566,7 @@ rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { static const struct nla_policy ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, @@ -5773,16 +5774,51 @@ out: return skb->len; } +void rtnl_offload_xstats_notify(struct net_device *dev) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct net *net = dev_net(dev); + int idxattr = 0, prividx = 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + ASSERT_RTNL(); + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + + skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!skb) + goto errout; + + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, + &response_filters, &idxattr, &prividx, NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_STATS, err); +} +EXPORT_SYMBOL(rtnl_offload_xstats_notify); + static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_stats_dump_filters response_filters = {}; struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; - int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; - struct sk_buff *nskb; + bool notify = false; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), @@ -5814,24 +5850,29 @@ static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - nskb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), - GFP_KERNEL); - if (!nskb) - return -ENOBUFS; + if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { + u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); - err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, - NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - 0, &response_filters, &idxattr, &prividx, - extack); - if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(nskb); - } else { - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + if (req) + err = netdev_offload_xstats_enable(dev, t_l3, extack); + else + err = netdev_offload_xstats_disable(dev, t_l3); + + if (!err) + notify = true; + else if (err != -EALREADY) + return err; + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); 
} - return err; + if (notify) + rtnl_offload_xstats_notify(dev); + + return 0; } /* Process one rtnetlink message. */ -- cgit v1.2.3 From 115dcec65f61d53e25e1bed5e380468b30f98b14 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 24 Feb 2022 16:20:18 +0200 Subject: vfio: Define device migration protocol v2 Replace the existing region based migration protocol with an ioctl based protocol. The two protocols have the same general semantic behaviors, but the way the data is transported is changed. This is the STOP_COPY portion of the new protocol, it defines the 5 states for basic stop and copy migration and the protocol to move the migration data in/out of the kernel. Compared to the clarification of the v1 protocol Alex proposed: https://lore.kernel.org/r/163909282574.728533.7460416142511440919.stgit@omen This has a few deliberate functional differences: - ERROR arcs allow the device function to remain unchanged. - The protocol is not required to return to the original state on transition failure. Instead userspace can execute an unwind back to the original state, reset, or do something else without needing kernel support. This simplifies the kernel design and should userspace choose a policy like always reset, avoids doing useless work in the kernel on error handling paths. - PRE_COPY is made optional, userspace must discover it before using it. This reflects the fact that the majority of drivers we are aware of right now will not implement PRE_COPY. - segmentation is not part of the data stream protocol, the receiver does not have to reproduce the framing boundaries. The hybrid FSM for the device_state is described as a Mealy machine by documenting each of the arcs the driver is required to implement. Defining the remaining set of old/new device_state transitions as 'combination transitions' which are naturally defined as taking multiple FSM arcs along the shortest path within the FSM's digraph allows a complete matrix of transitions. A new VFIO_DEVICE_FEATURE of VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE is defined to replace writing to the device_state field in the region. This allows returning a brand new FD whenever the requested transition opens a data transfer session. The VFIO core code implements the new feature and provides a helper function to the driver. Using the helper the driver only has to implement 6 of the FSM arcs and the other combination transitions are elaborated consistently from those arcs. A new VFIO_DEVICE_FEATURE of VFIO_DEVICE_FEATURE_MIGRATION is defined to report the capability for migration and indicate which set of states and arcs are supported by the device. The FSM provides a lot of flexibility to make backwards compatible extensions but the VFIO_DEVICE_FEATURE also allows for future breaking extensions for scenarios that cannot support even the basic STOP_COPY requirements. The VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE with the GET option (i.e. VFIO_DEVICE_FEATURE_GET) can be used to read the current migration state of the VFIO device. Data transfer sessions are now carried over a file descriptor, instead of the region. The FD functions for the lifetime of the data transfer session. read() and write() transfer the data with normal Linux stream FD semantics. This design allows future expansion to support poll(), io_uring, and other performance optimizations. 
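For illustration (not part of this patch), a hedged user-space sketch of the SET arc: move the device to STOP_COPY and stream the migration data out of the returned data_fd. It assumes the device reported VFIO_MIGRATION_STOP_COPY and elides most error handling:

#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

static int save_device_state(int device_fd, int out_fd)
{
	struct {
		struct vfio_device_feature feature;
		struct vfio_device_feature_mig_state mig;
	} req = {0};
	char buf[4096];
	ssize_t len;

	/* SET + MIG_DEVICE_STATE requests a transition; on success the
	 * kernel returns a fresh FD for the data transfer session. */
	req.feature.argsz = sizeof(req);
	req.feature.flags = VFIO_DEVICE_FEATURE_SET |
			    VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	req.mig.device_state = VFIO_DEVICE_STATE_STOP_COPY;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &req))
		return -1;

	/* Normal stream semantics: EOF on data_fd ends the session. */
	while ((len = read(req.mig.data_fd, buf, sizeof(buf))) > 0)
		if (write(out_fd, buf, len) != len)
			break;
	close(req.mig.data_fd);
	return len ? -1 : 0;
}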
The complicated mmap mode for data transfer is discarded as current qemu doesn't take meaningful advantage of it, and the new qemu implementation avoids substantially all the performance penalty of using a read() on the region. Link: https://lore.kernel.org/all/20220224142024.147653-10-yishaih@nvidia.com Signed-off-by: Jason Gunthorpe Tested-by: Shameer Kolothum Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Reviewed-by: Cornelia Huck Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- drivers/vfio/vfio.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 20 +++++ include/uapi/linux/vfio.h | 174 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 380 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 71763e2ac561..b37ab27b511f 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1557,6 +1557,197 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) return 0; } +/* + * vfio_mig_get_next_state - Compute the next step in the FSM + * @cur_fsm - The current state the device is in + * @new_fsm - The target state to reach + * @next_fsm - Pointer to the next step to get to new_fsm + * + * Return 0 upon success, otherwise -errno + * Upon success the next step in the state progression between cur_fsm and + * new_fsm will be set in next_fsm. + * + * This breaks down requests for combination transitions into smaller steps and + * returns the next step to get to new_fsm. The function may need to be called + * multiple times before reaching new_fsm. + * + */ +int vfio_mig_get_next_state(struct vfio_device *device, + enum vfio_device_mig_state cur_fsm, + enum vfio_device_mig_state new_fsm, + enum vfio_device_mig_state *next_fsm) +{ + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RESUMING + 1 }; + /* + * The coding in this table requires the driver to implement 6 + * FSM arcs: + * RESUMING -> STOP + * RUNNING -> STOP + * STOP -> RESUMING + * STOP -> RUNNING + * STOP -> STOP_COPY + * STOP_COPY -> STOP + * + * The coding will step through multiple states for these combination + * transitions: + * RESUMING -> STOP -> RUNNING + * RESUMING -> STOP -> STOP_COPY + * RUNNING -> STOP -> RESUMING + * RUNNING -> STOP -> STOP_COPY + * STOP_COPY -> STOP -> RESUMING + * STOP_COPY -> STOP -> RUNNING + */ + static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { + [VFIO_DEVICE_STATE_STOP] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_RUNNING] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_STOP_COPY] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_RESUMING] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, + 
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_ERROR] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + }; + + if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table))) + return -EINVAL; + + if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table)) + return -EINVAL; + + *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; + return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(vfio_mig_get_next_state); + +/* + * Convert the drivers's struct file into a FD number and return it to userspace + */ +static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg, + struct vfio_device_feature_mig_state *mig) +{ + int ret; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + ret = fd; + goto out_fput; + } + + mig->data_fd = fd; + if (copy_to_user(arg, mig, sizeof(*mig))) { + ret = -EFAULT; + goto out_put_unused; + } + fd_install(fd, filp); + return 0; + +out_put_unused: + put_unused_fd(fd); +out_fput: + fput(filp); + return ret; +} + +static int +vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_mig_state, data_fd); + struct vfio_device_feature_mig_state mig; + struct file *filp = NULL; + int ret; + + if (!device->ops->migration_set_state || + !device->ops->migration_get_state) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_GET, + sizeof(mig)); + if (ret != 1) + return ret; + + if (copy_from_user(&mig, arg, minsz)) + return -EFAULT; + + if (flags & VFIO_DEVICE_FEATURE_GET) { + enum vfio_device_mig_state curr_state; + + ret = device->ops->migration_get_state(device, &curr_state); + if (ret) + return ret; + mig.device_state = curr_state; + goto out_copy; + } + + /* Handle the VFIO_DEVICE_FEATURE_SET */ + filp = device->ops->migration_set_state(device, mig.device_state); + if (IS_ERR(filp) || !filp) + goto out_copy; + + return vfio_ioct_mig_return_fd(filp, arg, &mig); +out_copy: + mig.data_fd = -1; + if (copy_to_user(arg, &mig, sizeof(mig))) + return -EFAULT; + if (IS_ERR(filp)) + return PTR_ERR(filp); + return 0; +} + +static int vfio_ioctl_device_feature_migration(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + struct vfio_device_feature_migration mig = { + .flags = VFIO_MIGRATION_STOP_COPY, + }; + int ret; + + if (!device->ops->migration_set_state || + !device->ops->migration_get_state) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(mig)); + if (ret != 1) + return ret; + if (copy_to_user(arg, &mig, sizeof(mig))) + return -EFAULT; + return 0; +} + static int vfio_ioctl_device_feature(struct vfio_device *device, struct vfio_device_feature __user *arg) { @@ -1582,6 +1773,14 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return -EINVAL; switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_MIGRATION: + return vfio_ioctl_device_feature_migration( + device, feature.flags, arg->data, + feature.argsz - minsz); 
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: + return vfio_ioctl_device_feature_mig_device_state( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 550c28f2ef60..c44e80bbbd3b 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -56,6 +56,16 @@ struct vfio_device { * match, -errno for abort (ex. match with insufficient or incorrect * additional args) * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl + * @migration_set_state: Optional callback to change the migration state for + * devices that support migration. It's mandatory for + * VFIO_DEVICE_FEATURE_MIGRATION migration support. + * The returned FD is used for data transfer according to the FSM + * definition. The driver is responsible to ensure that FD reaches end + * of stream or error whenever the migration FSM leaves a data transfer + * state or before close_device() returns. + * @migration_get_state: Optional callback to get the migration state for + * devices that support migration. It's mandatory for + * VFIO_DEVICE_FEATURE_MIGRATION migration support. */ struct vfio_device_ops { char *name; @@ -72,6 +82,11 @@ struct vfio_device_ops { int (*match)(struct vfio_device *vdev, char *buf); int (*device_feature)(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); + struct file *(*migration_set_state)( + struct vfio_device *device, + enum vfio_device_mig_state new_state); + int (*migration_get_state)(struct vfio_device *device, + enum vfio_device_mig_state *curr_state); }; /** @@ -114,6 +129,11 @@ extern void vfio_device_put(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); +int vfio_mig_get_next_state(struct vfio_device *device, + enum vfio_device_mig_state cur_fsm, + enum vfio_device_mig_state new_fsm, + enum vfio_device_mig_state *next_fsm); + /* * External user API */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ef33ea002b0b..22ed358c04c5 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -605,25 +605,25 @@ struct vfio_region_gfx_edid { struct vfio_device_migration_info { __u32 device_state; /* VFIO device state */ -#define VFIO_DEVICE_STATE_STOP (0) -#define VFIO_DEVICE_STATE_RUNNING (1 << 0) -#define VFIO_DEVICE_STATE_SAVING (1 << 1) -#define VFIO_DEVICE_STATE_RESUMING (1 << 2) -#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ - VFIO_DEVICE_STATE_SAVING | \ - VFIO_DEVICE_STATE_RESUMING) +#define VFIO_DEVICE_STATE_V1_STOP (0) +#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_V1_RUNNING | \ + VFIO_DEVICE_STATE_V1_SAVING | \ + VFIO_DEVICE_STATE_V1_RESUMING) #define VFIO_DEVICE_STATE_VALID(state) \ - (state & VFIO_DEVICE_STATE_RESUMING ? \ - (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) + (state & VFIO_DEVICE_STATE_V1_RESUMING ? 
\ + (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_V1_RESUMING : 1) #define VFIO_DEVICE_STATE_IS_ERROR(state) \ - ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \ - VFIO_DEVICE_STATE_RESUMING)) + ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_V1_SAVING | \ + VFIO_DEVICE_STATE_V1_RESUMING)) #define VFIO_DEVICE_STATE_SET_ERROR(state) \ - ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \ - VFIO_DEVICE_STATE_RESUMING) + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_V1_SAVING | \ + VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; __u64 pending_bytes; @@ -1002,6 +1002,154 @@ struct vfio_device_feature { */ #define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0) +/* + * Indicates the device can support the migration API through + * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. If this GET succeeds, the RUNNING and + * ERROR states are always supported. Support for additional states is + * indicated via the flags field; at least VFIO_MIGRATION_STOP_COPY must be + * set. + * + * VFIO_MIGRATION_STOP_COPY means that STOP, STOP_COPY and + * RESUMING are supported. + */ +struct vfio_device_feature_migration { + __aligned_u64 flags; +#define VFIO_MIGRATION_STOP_COPY (1 << 0) +}; +#define VFIO_DEVICE_FEATURE_MIGRATION 1 + +/* + * Upon VFIO_DEVICE_FEATURE_SET, execute a migration state change on the VFIO + * device. The new state is supplied in device_state, see enum + * vfio_device_mig_state for details + * + * The kernel migration driver must fully transition the device to the new state + * value before the operation returns to the user. + * + * The kernel migration driver must not generate asynchronous device state + * transitions outside of manipulation by the user or the VFIO_DEVICE_RESET + * ioctl as described above. + * + * If this function fails then current device_state may be the original + * operating state or some other state along the combination transition path. + * The user can then decide if it should execute a VFIO_DEVICE_RESET, attempt + * to return to the original state, or attempt to return to some other state + * such as RUNNING or STOP. + * + * If the new_state starts a new data transfer session then the FD associated + * with that session is returned in data_fd. The user is responsible to close + * this FD when it is finished. The user must consider the migration data stream + * carried over the FD to be opaque and must preserve the byte order of the + * stream. The user is not required to preserve buffer segmentation when writing + * the data stream during the RESUMING operation. + * + * Upon VFIO_DEVICE_FEATURE_GET, get the current migration state of the VFIO + * device, data_fd will be -1. + */ +struct vfio_device_feature_mig_state { + __u32 device_state; /* From enum vfio_device_mig_state */ + __s32 data_fd; +}; +#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2 + +/* + * The device migration Finite State Machine is described by the enum + * vfio_device_mig_state. Some of the FSM arcs will create a migration data + * transfer session by returning a FD, in this case the migration data will + * flow over the FD using read() and write() as discussed below. 
+ *
+ * There are 5 states to support VFIO_MIGRATION_STOP_COPY:
+ *  RUNNING - The device is running normally
+ *  STOP - The device does not change the internal or external state
+ *  STOP_COPY - The device internal state can be read out
+ *  RESUMING - The device is stopped and is loading a new internal state
+ *  ERROR - The device has failed and must be reset
+ *
+ * The FSM takes actions on the arcs between FSM states. The driver implements
+ * the following behavior for the FSM arcs:
+ *
+ * RUNNING -> STOP
+ * STOP_COPY -> STOP
+ *   While in STOP the device must stop the operation of the device. The device
+ *   must not generate interrupts, DMA, or any other change to external state.
+ *   It must not change its internal state. When stopped the device and kernel
+ *   migration driver must accept and respond to interaction to support external
+ *   subsystems in the STOP state, for example PCI MSI-X and PCI config space.
+ *   Failure by the user to restrict device access while in STOP must not result
+ *   in error conditions outside the user context (ex. host system faults).
+ *
+ *   The STOP_COPY arc will terminate a data transfer session.
+ *
+ * RESUMING -> STOP
+ *   Leaving RESUMING terminates a data transfer session and indicates the
+ *   device should complete processing of the data delivered by write(). The
+ *   kernel migration driver should complete the incorporation of data written
+ *   to the data transfer FD into the device internal state and perform
+ *   final validity and consistency checking of the new device state. If the
+ *   user provided data is found to be incomplete, inconsistent, or otherwise
+ *   invalid, the migration driver must fail the SET_STATE ioctl and
+ *   optionally go to the ERROR state as described below.
+ *
+ *   While in STOP the device has the same behavior as other STOP states
+ *   described above.
+ *
+ *   To abort a RESUMING session the device must be reset.
+ *
+ * STOP -> RUNNING
+ *   While in RUNNING the device is fully operational, the device may generate
+ *   interrupts, DMA, respond to MMIO, all vfio device regions are functional,
+ *   and the device may advance its internal state.
+ *
+ * STOP -> STOP_COPY
+ *   This arc begins the process of saving the device state and will return a
+ *   new data_fd.
+ *
+ *   While in the STOP_COPY state the device has the same behavior as STOP
+ *   with the addition that the data transfer session continues to stream the
+ *   migration state. End of stream on the FD indicates the entire device
+ *   state has been transferred.
+ *
+ *   The user should take steps to restrict access to vfio device regions while
+ *   the device is in STOP_COPY or risk corruption of the device migration data
+ *   stream.
+ *
+ * STOP -> RESUMING
+ *   Entering the RESUMING state starts a process of restoring the device state
+ *   and will return a new data_fd. The data stream fed into the data_fd should
+ *   be taken from the data transfer output of a single FD during saving from
+ *   a compatible device. The migration driver may alter/reset the internal
+ *   device state for this arc if required to prepare the device to receive the
+ *   migration data.
+ *
+ * any -> ERROR
+ *   ERROR cannot be specified as a device state; however, any transition
+ *   request can be failed with an errno return and may then move the
+ *   device_state into ERROR. In this case the device was unable to execute the
+ *   requested arc and was also unable to restore the device to any valid
+ *   device_state.
+ * To recover from ERROR VFIO_DEVICE_RESET must be used to return the + * device_state back to RUNNING. + * + * The remaining possible transitions are interpreted as combinations of the + * above FSM arcs. As there are multiple paths through the FSM arcs the path + * should be selected based on the following rules: + * - Select the shortest path. + * Refer to vfio_mig_get_next_state() for the result of the algorithm. + * + * The automatic transit through the FSM arcs that make up the combination + * transition is invisible to the user. When working with combination arcs the + * user may see any step along the path in the device_state if SET_STATE + * fails. When handling these types of errors users should anticipate future + * revisions of this protocol using new states and those states becoming + * visible in this case. + */ +enum vfio_device_mig_state { + VFIO_DEVICE_STATE_ERROR = 0, + VFIO_DEVICE_STATE_STOP = 1, + VFIO_DEVICE_STATE_RUNNING = 2, + VFIO_DEVICE_STATE_STOP_COPY = 3, + VFIO_DEVICE_STATE_RESUMING = 4, +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- cgit v1.2.3 From 8cb3d83b959be0631cd719b995c40c3cda21cd47 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 24 Feb 2022 16:20:19 +0200 Subject: vfio: Extend the device migration protocol with RUNNING_P2P The RUNNING_P2P state is designed to support multiple devices in the same VM that are doing P2P transactions between themselves. When in RUNNING_P2P the device must be able to accept incoming P2P transactions but should not generate outgoing P2P transactions. As an optional extension to the mandatory states it is defined as in between STOP and RUNNING: STOP -> RUNNING_P2P -> RUNNING -> RUNNING_P2P -> STOP For drivers that are unable to support RUNNING_P2P the core code silently merges RUNNING_P2P and RUNNING together. Unless driver support is present, the new state cannot be used in SET_STATE. Drivers that support this will be required to implement 4 FSM arcs beyond the basic FSM. 2 of the basic FSM arcs become combination transitions. Compared to the v1 clarification, NDMA is redefined into FSM states and is described in terms of the desired P2P quiescent behavior, noting that halting all DMA is an acceptable implementation. 
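For a driver, supporting this extension mostly means declaring
VFIO_MIGRATION_P2P in migration_flags and providing the four extra arcs.
Below is a hedged sketch of a set_state callback built on the core helper;
mydrv_cur_state() and mydrv_do_one_arc() are hypothetical stand-ins for the
driver's per-arc device programming, not functions from this series:

#include <linux/err.h>
#include <linux/vfio.h>

/* Hypothetical driver internals, assumed to exist elsewhere: */
static enum vfio_device_mig_state mydrv_cur_state(struct vfio_device *vdev);
static struct file *mydrv_do_one_arc(struct vfio_device *vdev,
				     enum vfio_device_mig_state next);

static struct file *
mydrv_migration_set_state(struct vfio_device *vdev,
			  enum vfio_device_mig_state new_state)
{
	enum vfio_device_mig_state next;
	struct file *filp = NULL;
	int ret;

	/*
	 * Step along the shortest path one driver arc at a time. With
	 * VFIO_MIGRATION_P2P set, the core hands us the RUNNING <-> RUNNING_P2P
	 * and RUNNING_P2P <-> STOP arcs; without it, RUNNING_P2P is skipped and
	 * the same loop sees RUNNING <-> STOP directly.
	 */
	while (mydrv_cur_state(vdev) != new_state) {
		ret = vfio_mig_get_next_state(vdev, mydrv_cur_state(vdev),
					      new_state, &next);
		if (ret)
			return ERR_PTR(ret);
		filp = mydrv_do_one_arc(vdev, next);
		if (IS_ERR(filp))
			return filp;
	}
	/* NULL, or the data transfer FD opened by a STOP_COPY/RESUMING arc */
	return filp;
}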
Link: https://lore.kernel.org/all/20220224142024.147653-11-yishaih@nvidia.com Signed-off-by: Jason Gunthorpe Tested-by: Shameer Kolothum Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- drivers/vfio/vfio.c | 87 +++++++++++++++++++++++++++++++++++++---------- include/linux/vfio.h | 1 + include/uapi/linux/vfio.h | 36 ++++++++++++++++++-- 3 files changed, 104 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index b37ab27b511f..a4555014bd1e 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1577,39 +1577,56 @@ int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state new_fsm, enum vfio_device_mig_state *next_fsm) { - enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RESUMING + 1 }; + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; /* - * The coding in this table requires the driver to implement 6 - * FSM arcs: + * The coding in this table requires the driver to implement the + * following FSM arcs: * RESUMING -> STOP - * RUNNING -> STOP * STOP -> RESUMING - * STOP -> RUNNING * STOP -> STOP_COPY * STOP_COPY -> STOP * - * The coding will step through multiple states for these combination - * transitions: - * RESUMING -> STOP -> RUNNING + * If P2P is supported then the driver must also implement these FSM + * arcs: + * RUNNING -> RUNNING_P2P + * RUNNING_P2P -> RUNNING + * RUNNING_P2P -> STOP + * STOP -> RUNNING_P2P + * Without P2P the driver must implement: + * RUNNING -> STOP + * STOP -> RUNNING + * + * The coding will step through multiple states for some combination + * transitions; if all optional features are supported, this means the + * following ones: + * RESUMING -> STOP -> RUNNING_P2P + * RESUMING -> STOP -> RUNNING_P2P -> RUNNING * RESUMING -> STOP -> STOP_COPY - * RUNNING -> STOP -> RESUMING - * RUNNING -> STOP -> STOP_COPY + * RUNNING -> RUNNING_P2P -> STOP + * RUNNING -> RUNNING_P2P -> STOP -> RESUMING + * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY + * RUNNING_P2P -> STOP -> RESUMING + * RUNNING_P2P -> STOP -> STOP_COPY + * STOP -> RUNNING_P2P -> RUNNING * STOP_COPY -> STOP -> RESUMING - * STOP_COPY -> STOP -> RUNNING + * STOP_COPY -> STOP -> RUNNING_P2P + * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING */ static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, - [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, [VFIO_DEVICE_STATE_RUNNING] = { - [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, - [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, - [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, [VFIO_DEVICE_STATE_STOP_COPY] = { @@ -1617,6 +1634,7 @@ int vfio_mig_get_next_state(struct vfio_device 
*device, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, [VFIO_DEVICE_STATE_RESUMING] = { @@ -1624,6 +1642,15 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_RUNNING_P2P] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, [VFIO_DEVICE_STATE_ERROR] = { @@ -1631,17 +1658,41 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, }; - if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table))) + static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_RUNNING_P2P] = + VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P, + [VFIO_DEVICE_STATE_ERROR] = ~0U, + }; + + if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || + (state_flags_table[cur_fsm] & device->migration_flags) != + state_flags_table[cur_fsm])) return -EINVAL; - if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table)) + if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) || + (state_flags_table[new_fsm] & device->migration_flags) != + state_flags_table[new_fsm]) return -EINVAL; + /* + * Arcs touching optional and unsupported states are skipped over. The + * driver will instead see an arc from the original state to the next + * logical state, as per the above comment. + */ *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; + while ((state_flags_table[*next_fsm] & device->migration_flags) != + state_flags_table[*next_fsm]) + *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm]; + return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 
0 : -EINVAL; } EXPORT_SYMBOL_GPL(vfio_mig_get_next_state); @@ -1731,7 +1782,7 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device, size_t argsz) { struct vfio_device_feature_migration mig = { - .flags = VFIO_MIGRATION_STOP_COPY, + .flags = device->migration_flags, }; int ret; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index c44e80bbbd3b..66dda06ec42d 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -33,6 +33,7 @@ struct vfio_device { struct vfio_group *group; struct vfio_device_set *dev_set; struct list_head dev_set_list; + unsigned int migration_flags; /* Members below here are private, not for driver use */ refcount_t refcount; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 22ed358c04c5..26a66f68371d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1011,10 +1011,16 @@ struct vfio_device_feature { * * VFIO_MIGRATION_STOP_COPY means that STOP, STOP_COPY and * RESUMING are supported. + * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P + * is supported in addition to the STOP_COPY states. + * + * Other combinations of flags have behavior to be defined in the future. */ struct vfio_device_feature_migration { __aligned_u64 flags; #define VFIO_MIGRATION_STOP_COPY (1 << 0) +#define VFIO_MIGRATION_P2P (1 << 1) }; #define VFIO_DEVICE_FEATURE_MIGRATION 1 @@ -1065,10 +1071,13 @@ struct vfio_device_feature_mig_state { * RESUMING - The device is stopped and is loading a new internal state * ERROR - The device has failed and must be reset * + * And 1 optional state to support VFIO_MIGRATION_P2P: + * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA + * * The FSM takes actions on the arcs between FSM states. The driver implements * the following behavior for the FSM arcs: * - * RUNNING -> STOP + * RUNNING_P2P -> STOP * STOP_COPY -> STOP * While in STOP the device must stop the operation of the device. The device * must not generate interrupts, DMA, or any other change to external state. @@ -1095,11 +1104,16 @@ struct vfio_device_feature_mig_state { * * To abort a RESUMING session the device must be reset. * - * STOP -> RUNNING + * RUNNING_P2P -> RUNNING * While in RUNNING the device is fully operational, the device may generate * interrupts, DMA, respond to MMIO, all vfio device regions are functional, * and the device may advance its internal state. * + * RUNNING -> RUNNING_P2P + * STOP -> RUNNING_P2P + * While in RUNNING_P2P the device is partially running in the P2P quiescent + * state defined below. + * * STOP -> STOP_COPY * This arc begin the process of saving the device state and will return a * new data_fd. @@ -1129,6 +1143,18 @@ struct vfio_device_feature_mig_state { * To recover from ERROR VFIO_DEVICE_RESET must be used to return the * device_state back to RUNNING. * + * The optional peer to peer (P2P) quiescent state is intended to be a quiescent + * state for the device for the purposes of managing multiple devices within a + * user context where peer-to-peer DMA between devices may be active. The + * RUNNING_P2P states must prevent the device from initiating + * any new P2P DMA transactions. If the device can identify P2P transactions + * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration + * driver must complete any such outstanding operations prior to completing the + * FSM arc into a P2P state. For the purpose of specification the states + * behave as though the device was fully running if not supported. 
Like while in + * STOP or STOP_COPY the user must not touch the device, otherwise the state + * can be exited. + * * The remaining possible transitions are interpreted as combinations of the * above FSM arcs. As there are multiple paths through the FSM arcs the path * should be selected based on the following rules: @@ -1141,6 +1167,11 @@ struct vfio_device_feature_mig_state { * fails. When handling these types of errors users should anticipate future * revisions of this protocol using new states and those states becoming * visible in this case. + * + * The optional states cannot be used with SET_STATE if the device does not + * support them. The user can discover if these states are supported by using + * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can + * avoid knowing about these optional states if the kernel driver supports them. */ enum vfio_device_mig_state { VFIO_DEVICE_STATE_ERROR = 0, @@ -1148,6 +1179,7 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_RUNNING = 2, VFIO_DEVICE_STATE_STOP_COPY = 3, VFIO_DEVICE_STATE_RESUMING = 4, + VFIO_DEVICE_STATE_RUNNING_P2P = 5, }; /* -------- API for Type1 VFIO IOMMU -------- */ -- cgit v1.2.3 From 0f3f9cd7f752f6e685e378620babc5e34af6fb9f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 24 Feb 2022 16:20:20 +0200 Subject: vfio: Remove migration protocol v1 documentation v1 was never implemented and is replaced by v2. The old uAPI documentation is removed from the header file. The old uAPI definitions are still kept in the header file to ease transition for userspace copying these headers. They will be fully removed down the road. Link: https://lore.kernel.org/all/20220224142024.147653-12-yishaih@nvidia.com Signed-off-by: Jason Gunthorpe Tested-by: Shameer Kolothum Reviewed-by: Alex Williamson Reviewed-by: Cornelia Huck Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- include/uapi/linux/vfio.h | 200 +--------------------------------------------- 1 file changed, 2 insertions(+), 198 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 26a66f68371d..fea86061b44e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -323,7 +323,7 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) #define VFIO_REGION_TYPE_GFX (1) #define VFIO_REGION_TYPE_CCW (2) -#define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_TYPE_MIGRATION_DEPRECATED (3) /* sub-types for VFIO_REGION_TYPE_PCI_* */ @@ -405,203 +405,7 @@ struct vfio_region_gfx_edid { #define VFIO_REGION_SUBTYPE_CCW_CRW (3) /* sub-types for VFIO_REGION_TYPE_MIGRATION */ -#define VFIO_REGION_SUBTYPE_MIGRATION (1) - -/* - * The structure vfio_device_migration_info is placed at the 0th offset of - * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related - * migration information. Field accesses from this structure are only supported - * at their native width and alignment. Otherwise, the result is undefined and - * vendor drivers should return an error. - * - * device_state: (read/write) - * - The user application writes to this field to inform the vendor driver - * about the device state to be transitioned to. - * - The vendor driver should take the necessary actions to change the - * device state. After successful transition to a given state, the - * vendor driver should return success on write(device_state, state) - * system call. 
If the device state transition fails, the vendor driver - * should return an appropriate -errno for the fault condition. - * - On the user application side, if the device state transition fails, - * that is, if write(device_state, state) returns an error, read - * device_state again to determine the current state of the device from - * the vendor driver. - * - The vendor driver should return previous state of the device unless - * the vendor driver has encountered an internal error, in which case - * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR. - * - The user application must use the device reset ioctl to recover the - * device from VFIO_DEVICE_STATE_ERROR state. If the device is - * indicated to be in a valid device state by reading device_state, the - * user application may attempt to transition the device to any valid - * state reachable from the current state or terminate itself. - * - * device_state consists of 3 bits: - * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear, - * it indicates the _STOP state. When the device state is changed to - * _STOP, driver should stop the device before write() returns. - * - If bit 1 is set, it indicates the _SAVING state, which means that the - * driver should start gathering device state information that will be - * provided to the VFIO user application to save the device's state. - * - If bit 2 is set, it indicates the _RESUMING state, which means that - * the driver should prepare to resume the device. Data provided through - * the migration region should be used to resume the device. - * Bits 3 - 31 are reserved for future use. To preserve them, the user - * application should perform a read-modify-write operation on this - * field when modifying the specified bits. - * - * +------- _RESUMING - * |+------ _SAVING - * ||+----- _RUNNING - * ||| - * 000b => Device Stopped, not saving or resuming - * 001b => Device running, which is the default state - * 010b => Stop the device & save the device state, stop-and-copy state - * 011b => Device running and save the device state, pre-copy state - * 100b => Device stopped and the device state is resuming - * 101b => Invalid state - * 110b => Error state - * 111b => Invalid state - * - * State transitions: - * - * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP - * (100b) (001b) (011b) (010b) (000b) - * 0. Running or default state - * | - * - * 1. Normal Shutdown (optional) - * |------------------------------------->| - * - * 2. Save the state or suspend - * |------------------------->|---------->| - * - * 3. Save the state during live migration - * |----------->|------------>|---------->| - * - * 4. Resuming - * |<---------| - * - * 5. Resumed - * |--------->| - * - * 0. Default state of VFIO device is _RUNNING when the user application starts. - * 1. During normal shutdown of the user application, the user application may - * optionally change the VFIO device state from _RUNNING to _STOP. This - * transition is optional. The vendor driver must support this transition but - * must not require it. - * 2. When the user application saves state or suspends the application, the - * device state transitions from _RUNNING to stop-and-copy and then to _STOP. - * On state transition from _RUNNING to stop-and-copy, driver must stop the - * device, save the device state and send it to the application through the - * migration region. The sequence to be followed for such transition is given - * below. - * 3. 
In live migration of user application, the state transitions from _RUNNING - * to pre-copy, to stop-and-copy, and to _STOP. - * On state transition from _RUNNING to pre-copy, the driver should start - * gathering the device state while the application is still running and send - * the device state data to application through the migration region. - * On state transition from pre-copy to stop-and-copy, the driver must stop - * the device, save the device state and send it to the user application - * through the migration region. - * Vendor drivers must support the pre-copy state even for implementations - * where no data is provided to the user before the stop-and-copy state. The - * user must not be required to consume all migration data before the device - * transitions to a new state, including the stop-and-copy state. - * The sequence to be followed for above two transitions is given below. - * 4. To start the resuming phase, the device state should be transitioned from - * the _RUNNING to the _RESUMING state. - * In the _RESUMING state, the driver should use the device state data - * received through the migration region to resume the device. - * 5. After providing saved device data to the driver, the application should - * change the state from _RESUMING to _RUNNING. - * - * reserved: - * Reads on this field return zero and writes are ignored. - * - * pending_bytes: (read only) - * The number of pending bytes still to be migrated from the vendor driver. - * - * data_offset: (read only) - * The user application should read data_offset field from the migration - * region. The user application should read the device data from this - * offset within the migration region during the _SAVING state or write - * the device data during the _RESUMING state. See below for details of - * sequence to be followed. - * - * data_size: (read/write) - * The user application should read data_size to get the size in bytes of - * the data copied in the migration region during the _SAVING state and - * write the size in bytes of the data copied in the migration region - * during the _RESUMING state. - * - * The format of the migration region is as follows: - * ------------------------------------------------------------------ - * |vfio_device_migration_info| data section | - * | | /////////////////////////////// | - * ------------------------------------------------------------------ - * ^ ^ - * offset 0-trapped part data_offset - * - * The structure vfio_device_migration_info is always followed by the data - * section in the region, so data_offset will always be nonzero. The offset - * from where the data is copied is decided by the kernel driver. The data - * section can be trapped, mmapped, or partitioned, depending on how the kernel - * driver defines the data section. The data section partition can be defined - * as mapped by the sparse mmap capability. If mmapped, data_offset must be - * page aligned, whereas initial section which contains the - * vfio_device_migration_info structure, might not end at the offset, which is - * page aligned. The user is not required to access through mmap regardless - * of the capabilities of the region mmap. - * The vendor driver should determine whether and how to partition the data - * section. The vendor driver should return data_offset accordingly. - * - * The sequence to be followed while in pre-copy state and stop-and-copy state - * is as follows: - * a. Read pending_bytes, indicating the start of a new iteration to get device - * data. 
Repeated read on pending_bytes at this stage should have no side
- * effects.
- * If pending_bytes == 0, the user application should not iterate to get data
- * for that device.
- * If pending_bytes > 0, perform the following steps.
- * b. Read data_offset, indicating that the vendor driver should make data
- * available through the data section. The vendor driver should return this
- * read operation only after data is available from (region + data_offset)
- * to (region + data_offset + data_size).
- * c. Read data_size, which is the amount of data in bytes available through
- * the migration region.
- * Read on data_offset and data_size should return the offset and size of
- * the current buffer if the user application reads data_offset and
- * data_size more than once here.
- * d. Read data_size bytes of data from (region + data_offset) from the
- * migration region.
- * e. Process the data.
- * f. Read pending_bytes, which indicates that the data from the previous
- * iteration has been read. If pending_bytes > 0, go to step b.
- *
- * The user application can transition from the _SAVING|_RUNNING
- * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
- * number of pending bytes. The user application should iterate in _SAVING
- * (stop-and-copy) until pending_bytes is 0.
- *
- * The sequence to be followed while _RESUMING device state is as follows:
- * While data for this device is available, repeat the following steps:
- * a. Read data_offset from where the user application should write data.
- * b. Write migration data starting at the migration region + data_offset for
- * the length determined by data_size from the migration source.
- * c. Write data_size, which indicates to the vendor driver that data is
- * written in the migration region. Vendor driver must return this write
- * operations on consuming data. Vendor driver should apply the
- * user-provided migration region data to the device resume state.
- *
- * If an error occurs during the above sequences, the vendor driver can return
- * an error code for next read() or write() operation, which will terminate the
- * loop. The user application should then take the next necessary action, for
- * example, failing migration or terminating the user application.
- *
- * For the user application, data is opaque. The user application should write
- * data in the same order as the data is received and the data should be of
- * same transaction size at the source.
- */
+#define VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED (1)

 struct vfio_device_migration_info {
 	__u32 device_state;         /* VFIO device state */
--
cgit v1.2.3


From 8d21ec0e46ed6e39994accff8eb4f2be3d2e76b5 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 2 Mar 2022 11:56:34 -0800
Subject: bpf: Add __sk_buff->delivery_time_type and bpf_skb_set_delivery_time()

* __sk_buff->delivery_time_type:
This patch adds __sk_buff->delivery_time_type. It tells if the
delivery_time is stored in __sk_buff->tstamp or not.

It will be most useful for ingress to tell if the __sk_buff->tstamp
has the (rcv) timestamp or the delivery_time.

If delivery_time_type is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the
(rcv) timestamp. Two non-zero types are defined for the
delivery_time_type, BPF_SKB_DELIVERY_TIME_MONO and
BPF_SKB_DELIVERY_TIME_UNSPEC. UNSPEC can only be seen at egress because
only the mono delivery_time can be forwarded to ingress for now. The
clock of an UNSPEC delivery_time can be deduced from
skb->sk->sk_clockid, which is also how sch_etf does it.
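As a hedged illustration of this new field together with the
bpf_skb_set_delivery_time() helper described further below (assuming a
vmlinux.h and libbpf helper definitions generated from this patch; the
ifindex and delay values are placeholders):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define EGRESS_IFINDEX	2		/* placeholder */
#define DELAY_NS	(1000 * 1000)	/* placeholder */

SEC("tc")
int fwd_with_mono_edt(struct __sk_buff *skb)
{
	/* At ingress, a zero type means tstamp holds the (rcv) timestamp. */
	if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_NONE)
		/* Stamp a mono delivery time; only MONO survives
		 * bpf_redirect_*() to the egress side.
		 */
		bpf_skb_set_delivery_time(skb, bpf_ktime_get_ns() + DELAY_NS,
					  BPF_SKB_DELIVERY_TIME_MONO);

	return bpf_redirect(EGRESS_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";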
* Provide forwarded delivery_time to tc-bpf@ingress: With the help of the new delivery_time_type, the tc-bpf has a way to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. During bpf load time, the verifier will learn if the bpf prog has accessed the new __sk_buff->delivery_time_type. If it does, it means the tc-bpf@ingress is expecting the skb->tstamp could have the delivery_time. The kernel will then read the skb->tstamp as-is during bpf insn rewrite without checking the skb->mono_delivery_time. This is done by adding a new prog->delivery_time_access bit. The same goes for writing skb->tstamp. * bpf_skb_set_delivery_time(): The bpf_skb_set_delivery_time() helper is added to allow setting both delivery_time and the delivery_time_type at the same time. If the tc-bpf does not need to change the delivery_time_type, it can directly write to the __sk_buff->tstamp as the existing tc-bpf has already been doing. It will be most useful at ingress to change the __sk_buff->tstamp from the (rcv) timestamp to a mono delivery_time and then bpf_redirect_*(). bpf only has mono clock helper (bpf_ktime_get_ns), and the current known use case is the mono EDT for fq, and only mono delivery time can be kept during forward now, so bpf_skb_set_delivery_time() only supports setting BPF_SKB_DELIVERY_TIME_MONO. It can be extended later when use cases come up and the forwarding path also supports other clock bases. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +- include/uapi/linux/bpf.h | 41 +++++++++- net/core/filter.c | 169 ++++++++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 41 +++++++++- 4 files changed, 216 insertions(+), 38 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1cb1af917617..9bf26307247f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -572,7 +572,8 @@ struct bpf_prog { has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1; /* Do we call get_func_ip() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. + * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. + * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. 
+ * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; diff --git a/net/core/filter.c b/net/core/filter.c index 5072733743e9..88767f7da150 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7388,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, + u64, dtime, u32, dtime_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (dtime_type) { + case BPF_SKB_DELIVERY_TIME_MONO: + if (!dtime) + return -EINVAL; + skb->tstamp = dtime; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_DELIVERY_TIME_NONE: + if (dtime) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { + .func = bpf_skb_set_delivery_time, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7749,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_delivery_time: + return &bpf_skb_set_delivery_time_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8088,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetof(struct __sk_buff, delivery_time_type): + return false; + case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. 
*/ return false; default: @@ -8443,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case offsetof(struct __sk_buff, delivery_time_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->delivery_time_type or not. + * Thus, we need to set prog->delivery_time_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->delivery_time_access = 1; + return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); @@ -8838,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); + + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); + +#ifdef CONFIG_NET_CLS_ACT + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* At ingress, value_reg = 0 */ + *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); +#endif + + /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + + /* 15 insns with CONFIG_NET_CLS_ACT */ + return insn; +} + static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, struct bpf_insn *insn) { @@ -8859,29 +8948,32 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_CLS_ACT - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); - /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, - * so check the skb->mono_delivery_time. - */ - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. 
*/ - *insn++ = BPF_MOV64_IMM(value_reg, 0); - *insn++ = BPF_JMP_A(1); + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); + /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, + * so check the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, @@ -8889,27 +8981,30 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_CLS_ACT - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); - /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. - * Clear the skb->mono_delivery_time. - */ - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - ~SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); + /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. + * Clear the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + } #endif /* skb->tstamp = tstamp */ @@ -9226,9 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - insn = bpf_convert_tstamp_write(si, insn); + insn = bpf_convert_tstamp_write(prog, si, insn); else - insn = bpf_convert_tstamp_read(si, insn); + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, delivery_time_type): + insn = bpf_convert_dtime_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. 
+ * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. + * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. + * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; -- cgit v1.2.3 From 51ef2be546e2e480e56fdb59fdeb9a4406e8d52e Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 17 Feb 2022 16:44:07 +0100 Subject: media: i2c: isl7998x: Add driver for Intersil ISL7998x Add driver for the Intersil ISL7998x Analog to MIPI CSI-2/BT656 decoder. This chip supports 1/2/4 analog video inputs and converts them into 1/2/4 VCs in MIPI CSI2 stream. This driver currently supports ISL79987 and both 720x480 and 720x576 resolutions, however as per specification, all inputs must use the same resolution and standard. The only supported pixel format is now YUYV/YUV422. The chip should support RGB565 on the CSI2 as well, but this is currently unsupported. 
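One orientation note: the ISL7998x register map is paged, and the driver
folds the page number into bits 15:8 of a pseudo register address (the
ISL7998X_REG() macro below), with register 0xff on every page acting as
the page select. A hedged sketch of how such an address could be split
back into raw I2C accesses; the helper names are illustrative and not the
driver's actual regmap plumbing:

#include <linux/i2c.h>

/* Mirrors the driver's encoding: pseudo-address = (page << 8) | reg */
static int isl7998x_select_page(struct i2c_client *client, unsigned int page)
{
	/* Register 0xff on every page selects the active decoder page. */
	return i2c_smbus_write_byte_data(client, 0xff, page & 0xf);
}

static int isl7998x_read_paged(struct i2c_client *client, unsigned int addr)
{
	int ret = isl7998x_select_page(client, addr >> 8);

	if (ret)
		return ret;
	return i2c_smbus_read_byte_data(client, addr & 0xff);
}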
Signed-off-by: Marek Vasut Cc: Sakari Ailus Cc: Mauro Carvalho Chehab Cc: Rob Herring To: linux-media@vger.kernel.org Signed-off-by: Michael Tretter Acked-by: Hans Verkuil [Sakari Ailus: Always call pm_runtime_get_and_resume in pre_streamon] Signed-off-by: Sakari Ailus --- MAINTAINERS | 8 + drivers/media/i2c/Kconfig | 10 + drivers/media/i2c/Makefile | 1 + drivers/media/i2c/isl7998x.c | 1628 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/v4l2-controls.h | 6 + 5 files changed, 1653 insertions(+) create mode 100644 drivers/media/i2c/isl7998x.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index b3949b10dbbf..50015ff7ff54 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10006,6 +10006,14 @@ L: linux-iio@vger.kernel.org F: Documentation/devicetree/bindings/counter/interrupt-counter.yaml F: drivers/counter/interrupt-cnt.c +INTERSIL ISL7998X VIDEO DECODER DRIVER +M: Michael Tretter +R: Pengutronix Kernel Team +L: linux-media@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/media/i2c/isil,isl79987.yaml +F: drivers/media/i2c/isl7998x.c + INVENSENSE ICM-426xx IMU DRIVER M: Jean-Baptiste Maneyrol L: linux-iio@vger.kernel.org diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig index 508145b572cf..e7194c1be4d2 100644 --- a/drivers/media/i2c/Kconfig +++ b/drivers/media/i2c/Kconfig @@ -325,6 +325,16 @@ config VIDEO_BT866 To compile this driver as a module, choose M here: the module will be called bt866. +config VIDEO_ISL7998X + tristate "Intersil ISL7998x video decoder" + depends on VIDEO_V4L2 && I2C + depends on OF_GPIO + select MEDIA_CONTROLLER + select VIDEO_V4L2_SUBDEV_API + help + Support for Intersil ISL7998x analog to MIPI-CSI2 or + BT.656 decoder. + config VIDEO_KS0127 tristate "KS0127 video decoder" depends on VIDEO_V4L2 && I2C diff --git a/drivers/media/i2c/Makefile b/drivers/media/i2c/Makefile index d85d7a7e9c0f..7f8c1df60330 100644 --- a/drivers/media/i2c/Makefile +++ b/drivers/media/i2c/Makefile @@ -134,6 +134,7 @@ obj-$(CONFIG_VIDEO_IMX334) += imx334.o obj-$(CONFIG_VIDEO_IMX335) += imx335.o obj-$(CONFIG_VIDEO_IMX355) += imx355.o obj-$(CONFIG_VIDEO_IMX412) += imx412.o +obj-$(CONFIG_VIDEO_ISL7998X) += isl7998x.o obj-$(CONFIG_VIDEO_MAX9286) += max9286.o obj-$(CONFIG_VIDEO_MAX9271_LIB) += max9271.o obj-$(CONFIG_VIDEO_RDACM20) += rdacm20.o diff --git a/drivers/media/i2c/isl7998x.c b/drivers/media/i2c/isl7998x.c new file mode 100644 index 000000000000..dc3068549dfa --- /dev/null +++ b/drivers/media/i2c/isl7998x.c @@ -0,0 +1,1628 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intersil ISL7998x analog to MIPI CSI-2 or BT.656 decoder driver. + * + * Copyright (C) 2018-2019 Marek Vasut + * Copyright (C) 2021 Michael Tretter + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * This control allows to activate and deactivate the test pattern on + * selected output channels. + * This value is ISL7998x specific. + */ +#define V4L2_CID_TEST_PATTERN_CHANNELS (V4L2_CID_USER_ISL7998X_BASE + 0) + +/* + * This control allows to specify the color of the test pattern. + * This value is ISL7998x specific. + */ +#define V4L2_CID_TEST_PATTERN_COLOR (V4L2_CID_USER_ISL7998X_BASE + 1) + +/* + * This control allows to specify the bar pattern in the test pattern. + * This value is ISL7998x specific. 
+ */ +#define V4L2_CID_TEST_PATTERN_BARS (V4L2_CID_USER_ISL7998X_BASE + 2) + +#define ISL7998X_INPUTS 4 + +#define ISL7998X_REG(page, reg) (((page) << 8) | (reg)) + +#define ISL7998X_REG_PN_SIZE 256 +#define ISL7998X_REG_PN_BASE(n) ((n) * ISL7998X_REG_PN_SIZE) + +#define ISL7998X_REG_PX_DEC_PAGE(page) ISL7998X_REG((page), 0xff) +#define ISL7998X_REG_PX_DEC_PAGE_MASK 0xf +#define ISL7998X_REG_P0_PRODUCT_ID_CODE ISL7998X_REG(0, 0x00) +#define ISL7998X_REG_P0_PRODUCT_REV_CODE ISL7998X_REG(0, 0x01) +#define ISL7998X_REG_P0_SW_RESET_CTL ISL7998X_REG(0, 0x02) +#define ISL7998X_REG_P0_IO_BUFFER_CTL ISL7998X_REG(0, 0x03) +#define ISL7998X_REG_P0_IO_BUFFER_CTL_1_1 ISL7998X_REG(0, 0x04) +#define ISL7998X_REG_P0_IO_PAD_PULL_EN_CTL ISL7998X_REG(0, 0x05) +#define ISL7998X_REG_P0_IO_BUFFER_CTL_1_2 ISL7998X_REG(0, 0x06) +#define ISL7998X_REG_P0_VIDEO_IN_CHAN_CTL ISL7998X_REG(0, 0x07) +#define ISL7998X_REG_P0_CLK_CTL_1 ISL7998X_REG(0, 0x08) +#define ISL7998X_REG_P0_CLK_CTL_2 ISL7998X_REG(0, 0x09) +#define ISL7998X_REG_P0_CLK_CTL_3 ISL7998X_REG(0, 0x0a) +#define ISL7998X_REG_P0_CLK_CTL_4 ISL7998X_REG(0, 0x0b) +#define ISL7998X_REG_P0_MPP1_SYNC_CTL ISL7998X_REG(0, 0x0c) +#define ISL7998X_REG_P0_MPP2_SYNC_CTL ISL7998X_REG(0, 0x0d) +#define ISL7998X_REG_P0_IRQ_SYNC_CTL ISL7998X_REG(0, 0x0e) +#define ISL7998X_REG_P0_INTERRUPT_STATUS ISL7998X_REG(0, 0x10) +#define ISL7998X_REG_P0_CHAN_1_IRQ ISL7998X_REG(0, 0x11) +#define ISL7998X_REG_P0_CHAN_2_IRQ ISL7998X_REG(0, 0x12) +#define ISL7998X_REG_P0_CHAN_3_IRQ ISL7998X_REG(0, 0x13) +#define ISL7998X_REG_P0_CHAN_4_IRQ ISL7998X_REG(0, 0x14) +#define ISL7998X_REG_P0_SHORT_DIAG_IRQ ISL7998X_REG(0, 0x15) +#define ISL7998X_REG_P0_CHAN_1_IRQ_EN ISL7998X_REG(0, 0x16) +#define ISL7998X_REG_P0_CHAN_2_IRQ_EN ISL7998X_REG(0, 0x17) +#define ISL7998X_REG_P0_CHAN_3_IRQ_EN ISL7998X_REG(0, 0x18) +#define ISL7998X_REG_P0_CHAN_4_IRQ_EN ISL7998X_REG(0, 0x19) +#define ISL7998X_REG_P0_SHORT_DIAG_IRQ_EN ISL7998X_REG(0, 0x1a) +#define ISL7998X_REG_P0_CHAN_1_STATUS ISL7998X_REG(0, 0x1b) +#define ISL7998X_REG_P0_CHAN_2_STATUS ISL7998X_REG(0, 0x1c) +#define ISL7998X_REG_P0_CHAN_3_STATUS ISL7998X_REG(0, 0x1d) +#define ISL7998X_REG_P0_CHAN_4_STATUS ISL7998X_REG(0, 0x1e) +#define ISL7998X_REG_P0_SHORT_DIAG_STATUS ISL7998X_REG(0, 0x1f) +#define ISL7998X_REG_P0_CLOCK_DELAY ISL7998X_REG(0, 0x20) + +#define ISL7998X_REG_PX_DEC_INPUT_FMT(pg) ISL7998X_REG((pg), 0x02) +#define ISL7998X_REG_PX_DEC_STATUS_1(pg) ISL7998X_REG((pg), 0x03) +#define ISL7998X_REG_PX_DEC_STATUS_1_VDLOSS BIT(7) +#define ISL7998X_REG_PX_DEC_STATUS_1_HLOCK BIT(6) +#define ISL7998X_REG_PX_DEC_STATUS_1_VLOCK BIT(3) +#define ISL7998X_REG_PX_DEC_HS_DELAY_CTL(pg) ISL7998X_REG((pg), 0x04) +#define ISL7998X_REG_PX_DEC_ANCTL(pg) ISL7998X_REG((pg), 0x06) +#define ISL7998X_REG_PX_DEC_CROP_HI(pg) ISL7998X_REG((pg), 0x07) +#define ISL7998X_REG_PX_DEC_VDELAY_LO(pg) ISL7998X_REG((pg), 0x08) +#define ISL7998X_REG_PX_DEC_VACTIVE_LO(pg) ISL7998X_REG((pg), 0x09) +#define ISL7998X_REG_PX_DEC_HDELAY_LO(pg) ISL7998X_REG((pg), 0x0a) +#define ISL7998X_REG_PX_DEC_HACTIVE_LO(pg) ISL7998X_REG((pg), 0x0b) +#define ISL7998X_REG_PX_DEC_CNTRL1(pg) ISL7998X_REG((pg), 0x0c) +#define ISL7998X_REG_PX_DEC_CSC_CTL(pg) ISL7998X_REG((pg), 0x0d) +#define ISL7998X_REG_PX_DEC_BRIGHT(pg) ISL7998X_REG((pg), 0x10) +#define ISL7998X_REG_PX_DEC_CONTRAST(pg) ISL7998X_REG((pg), 0x11) +#define ISL7998X_REG_PX_DEC_SHARPNESS(pg) ISL7998X_REG((pg), 0x12) +#define ISL7998X_REG_PX_DEC_SAT_U(pg) ISL7998X_REG((pg), 0x13) +#define ISL7998X_REG_PX_DEC_SAT_V(pg) ISL7998X_REG((pg), 0x14) 
+#define ISL7998X_REG_PX_DEC_HUE(pg) ISL7998X_REG((pg), 0x15) +#define ISL7998X_REG_PX_DEC_VERT_PEAK(pg) ISL7998X_REG((pg), 0x17) +#define ISL7998X_REG_PX_DEC_CORING(pg) ISL7998X_REG((pg), 0x18) +#define ISL7998X_REG_PX_DEC_SDT(pg) ISL7998X_REG((pg), 0x1c) +#define ISL7998X_REG_PX_DEC_SDT_DET BIT(7) +#define ISL7998X_REG_PX_DEC_SDT_NOW GENMASK(6, 4) +#define ISL7998X_REG_PX_DEC_SDT_STANDARD GENMASK(2, 0) +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_NTSC_M 0 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL 1 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_SECAM 2 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_NTSC_443 3 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL_M 4 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL_CN 5 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL_60 6 +#define ISL7998X_REG_PX_DEC_SDT_STANDARD_UNKNOWN 7 +#define ISL7998X_REG_PX_DEC_SDTR(pg) ISL7998X_REG((pg), 0x1d) +#define ISL7998X_REG_PX_DEC_SDTR_ATSTART BIT(7) +#define ISL7998X_REG_PX_DEC_CLMPG(pg) ISL7998X_REG((pg), 0x20) +#define ISL7998X_REG_PX_DEC_IAGC(pg) ISL7998X_REG((pg), 0x21) +#define ISL7998X_REG_PX_DEC_AGCGAIN(pg) ISL7998X_REG((pg), 0x22) +#define ISL7998X_REG_PX_DEC_PEAKWT(pg) ISL7998X_REG((pg), 0x23) +#define ISL7998X_REG_PX_DEC_CLMPL(pg) ISL7998X_REG((pg), 0x24) +#define ISL7998X_REG_PX_DEC_SYNCT(pg) ISL7998X_REG((pg), 0x25) +#define ISL7998X_REG_PX_DEC_MISSCNT(pg) ISL7998X_REG((pg), 0x26) +#define ISL7998X_REG_PX_DEC_PCLAMP(pg) ISL7998X_REG((pg), 0x27) +#define ISL7998X_REG_PX_DEC_VERT_CTL_1(pg) ISL7998X_REG((pg), 0x28) +#define ISL7998X_REG_PX_DEC_VERT_CTL_2(pg) ISL7998X_REG((pg), 0x29) +#define ISL7998X_REG_PX_DEC_CLR_KILL_LVL(pg) ISL7998X_REG((pg), 0x2a) +#define ISL7998X_REG_PX_DEC_COMB_FILTER_CTL(pg) ISL7998X_REG((pg), 0x2b) +#define ISL7998X_REG_PX_DEC_LUMA_DELAY(pg) ISL7998X_REG((pg), 0x2c) +#define ISL7998X_REG_PX_DEC_MISC1(pg) ISL7998X_REG((pg), 0x2d) +#define ISL7998X_REG_PX_DEC_MISC2(pg) ISL7998X_REG((pg), 0x2e) +#define ISL7998X_REG_PX_DEC_MISC3(pg) ISL7998X_REG((pg), 0x2f) +#define ISL7998X_REG_PX_DEC_MVSN(pg) ISL7998X_REG((pg), 0x30) +#define ISL7998X_REG_PX_DEC_CSTATUS2(pg) ISL7998X_REG((pg), 0x31) +#define ISL7998X_REG_PX_DEC_HFREF(pg) ISL7998X_REG((pg), 0x32) +#define ISL7998X_REG_PX_DEC_CLMD(pg) ISL7998X_REG((pg), 0x33) +#define ISL7998X_REG_PX_DEC_ID_DET_CTL(pg) ISL7998X_REG((pg), 0x34) +#define ISL7998X_REG_PX_DEC_CLCNTL(pg) ISL7998X_REG((pg), 0x35) +#define ISL7998X_REG_PX_DEC_DIFF_CLMP_CTL_1(pg) ISL7998X_REG((pg), 0x36) +#define ISL7998X_REG_PX_DEC_DIFF_CLMP_CTL_2(pg) ISL7998X_REG((pg), 0x37) +#define ISL7998X_REG_PX_DEC_DIFF_CLMP_CTL_3(pg) ISL7998X_REG((pg), 0x38) +#define ISL7998X_REG_PX_DEC_DIFF_CLMP_CTL_4(pg) ISL7998X_REG((pg), 0x39) +#define ISL7998X_REG_PX_DEC_SHORT_DET_CTL(pg) ISL7998X_REG((pg), 0x3a) +#define ISL7998X_REG_PX_DEC_SHORT_DET_CTL_1(pg) ISL7998X_REG((pg), 0x3b) +#define ISL7998X_REG_PX_DEC_AFE_TST_MUX_CTL(pg) ISL7998X_REG((pg), 0x3c) +#define ISL7998X_REG_PX_DEC_DATA_CONV(pg) ISL7998X_REG((pg), 0x3d) +#define ISL7998X_REG_PX_DEC_INTERNAL_TEST(pg) ISL7998X_REG((pg), 0x3f) +#define ISL7998X_REG_PX_DEC_H_DELAY_CTL(pg) ISL7998X_REG((pg), 0x43) +#define ISL7998X_REG_PX_DEC_H_DELAY_II_HI(pg) ISL7998X_REG((pg), 0x44) +#define ISL7998X_REG_PX_DEC_H_DELAY_II_LOW(pg) ISL7998X_REG((pg), 0x45) + +#define ISL7998X_REG_PX_ACA_CTL_1(pg) ISL7998X_REG((pg), 0x80) +#define ISL7998X_REG_PX_ACA_GAIN_CTL(pg) ISL7998X_REG((pg), 0x81) +#define ISL7998X_REG_PX_ACA_Y_AVG_HI_LIMIT(pg) ISL7998X_REG((pg), 0x82) +#define ISL7998X_REG_PX_ACA_Y_AVG_LO_LIMIT(pg) ISL7998X_REG((pg), 0x83) +#define 
ISL7998X_REG_PX_ACA_Y_DET_THRESHOLD(pg) ISL7998X_REG((pg), 0x84) +#define ISL7998X_REG_PX_ACA_BLACK_LVL(pg) ISL7998X_REG((pg), 0x85) +#define ISL7998X_REG_PX_ACA_CENTER_LVL(pg) ISL7998X_REG((pg), 0x86) +#define ISL7998X_REG_PX_ACA_WHITE_LVL(pg) ISL7998X_REG((pg), 0x87) +#define ISL7998X_REG_PX_ACA_MEAN_OFF_LIMIT(pg) ISL7998X_REG((pg), 0x88) +#define ISL7998X_REG_PX_ACA_MEAN_OFF_UPGAIN(pg) ISL7998X_REG((pg), 0x89) +#define ISL7998X_REG_PX_ACA_MEAN_OFF_SLOPE(pg) ISL7998X_REG((pg), 0x8a) +#define ISL7998X_REG_PX_ACA_MEAN_OFF_DNGAIN(pg) ISL7998X_REG((pg), 0x8b) +#define ISL7998X_REG_PX_ACA_DELTA_CO_THRES(pg) ISL7998X_REG((pg), 0x8c) +#define ISL7998X_REG_PX_ACA_DELTA_SLOPE(pg) ISL7998X_REG((pg), 0x8d) +#define ISL7998X_REG_PX_ACA_LO_HI_AVG_THRES(pg) ISL7998X_REG((pg), 0x8e) +#define ISL7998X_REG_PX_ACA_LO_MAX_LVL_CTL(pg) ISL7998X_REG((pg), 0x8f) +#define ISL7998X_REG_PX_ACA_HI_MAX_LVL_CTL(pg) ISL7998X_REG((pg), 0x90) +#define ISL7998X_REG_PX_ACA_LO_UPGAIN_CTL(pg) ISL7998X_REG((pg), 0x91) +#define ISL7998X_REG_PX_ACA_LO_DNGAIN_CTL(pg) ISL7998X_REG((pg), 0x92) +#define ISL7998X_REG_PX_ACA_HI_UPGAIN_CTL(pg) ISL7998X_REG((pg), 0x93) +#define ISL7998X_REG_PX_ACA_HI_DNGAIN_CTL(pg) ISL7998X_REG((pg), 0x94) +#define ISL7998X_REG_PX_ACA_LOPASS_FLT_COEF(pg) ISL7998X_REG((pg), 0x95) +#define ISL7998X_REG_PX_ACA_PDF_INDEX(pg) ISL7998X_REG((pg), 0x96) +#define ISL7998X_REG_PX_ACA_HIST_WIN_H_STT(pg) ISL7998X_REG((pg), 0x97) +#define ISL7998X_REG_PX_ACA_HIST_WIN_H_SZ1(pg) ISL7998X_REG((pg), 0x98) +#define ISL7998X_REG_PX_ACA_HIST_WIN_H_SZ2(pg) ISL7998X_REG((pg), 0x99) +#define ISL7998X_REG_PX_ACA_HIST_WIN_V_STT(pg) ISL7998X_REG((pg), 0x9a) +#define ISL7998X_REG_PX_ACA_HIST_WIN_V_SZ1(pg) ISL7998X_REG((pg), 0x9b) +#define ISL7998X_REG_PX_ACA_HIST_WIN_V_SZ2(pg) ISL7998X_REG((pg), 0x9c) +#define ISL7998X_REG_PX_ACA_Y_AVG(pg) ISL7998X_REG((pg), 0xa0) +#define ISL7998X_REG_PX_ACA_Y_AVG_LIM(pg) ISL7998X_REG((pg), 0xa1) +#define ISL7998X_REG_PX_ACA_LO_AVG(pg) ISL7998X_REG((pg), 0xa2) +#define ISL7998X_REG_PX_ACA_HI_AVG(pg) ISL7998X_REG((pg), 0xa3) +#define ISL7998X_REG_PX_ACA_Y_MAX(pg) ISL7998X_REG((pg), 0xa4) +#define ISL7998X_REG_PX_ACA_Y_MIN(pg) ISL7998X_REG((pg), 0xa5) +#define ISL7998X_REG_PX_ACA_MOFFSET(pg) ISL7998X_REG((pg), 0xa6) +#define ISL7998X_REG_PX_ACA_LO_GAIN(pg) ISL7998X_REG((pg), 0xa7) +#define ISL7998X_REG_PX_ACA_HI_GAIN(pg) ISL7998X_REG((pg), 0xa8) +#define ISL7998X_REG_PX_ACA_LL_SLOPE(pg) ISL7998X_REG((pg), 0xa9) +#define ISL7998X_REG_PX_ACA_LH_SLOPE(pg) ISL7998X_REG((pg), 0xaa) +#define ISL7998X_REG_PX_ACA_HL_SLOPE(pg) ISL7998X_REG((pg), 0xab) +#define ISL7998X_REG_PX_ACA_HH_SLOPE(pg) ISL7998X_REG((pg), 0xac) +#define ISL7998X_REG_PX_ACA_X_LOW(pg) ISL7998X_REG((pg), 0xad) +#define ISL7998X_REG_PX_ACA_X_MEAN(pg) ISL7998X_REG((pg), 0xae) +#define ISL7998X_REG_PX_ACA_X_HIGH(pg) ISL7998X_REG((pg), 0xaf) +#define ISL7998X_REG_PX_ACA_Y_LOW(pg) ISL7998X_REG((pg), 0xb0) +#define ISL7998X_REG_PX_ACA_Y_MEAN(pg) ISL7998X_REG((pg), 0xb1) +#define ISL7998X_REG_PX_ACA_Y_HIGH(pg) ISL7998X_REG((pg), 0xb2) +#define ISL7998X_REG_PX_ACA_CTL_2(pg) ISL7998X_REG((pg), 0xb3) +#define ISL7998X_REG_PX_ACA_CTL_3(pg) ISL7998X_REG((pg), 0xb4) +#define ISL7998X_REG_PX_ACA_CTL_4(pg) ISL7998X_REG((pg), 0xb5) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_HIST(pg) ISL7998X_REG((pg), 0xc0) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_TL_H(pg) ISL7998X_REG((pg), 0xc1) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_TL_L(pg) ISL7998X_REG((pg), 0xc2) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_TL_H(pg) ISL7998X_REG((pg), 0xc3) +#define 
ISL7998X_REG_PX_ACA_FLEX_WIN_Y_TL_L(pg) ISL7998X_REG((pg), 0xc4) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_TR_H(pg) ISL7998X_REG((pg), 0xc5) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_TR_L(pg) ISL7998X_REG((pg), 0xc6) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_TR_H(pg) ISL7998X_REG((pg), 0xc7) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_TR_L(pg) ISL7998X_REG((pg), 0xc8) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_BL_H(pg) ISL7998X_REG((pg), 0xc9) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_BL_L(pg) ISL7998X_REG((pg), 0xca) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_BL_H(pg) ISL7998X_REG((pg), 0xcb) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_BL_L(pg) ISL7998X_REG((pg), 0xcc) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_BR_H(pg) ISL7998X_REG((pg), 0xcd) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_BR_L(pg) ISL7998X_REG((pg), 0xce) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_BR_H(pg) ISL7998X_REG((pg), 0xcf) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_BR_L(pg) ISL7998X_REG((pg), 0xd0) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_LM_H(pg) ISL7998X_REG((pg), 0xd1) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_LM_L(pg) ISL7998X_REG((pg), 0xd2) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_LM_H(pg) ISL7998X_REG((pg), 0xd3) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_LM_L(pg) ISL7998X_REG((pg), 0xd4) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_TM_H(pg) ISL7998X_REG((pg), 0xd5) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_TM_L(pg) ISL7998X_REG((pg), 0xd6) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_TM_H(pg) ISL7998X_REG((pg), 0xd7) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_TM_L(pg) ISL7998X_REG((pg), 0xd8) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_BM_H(pg) ISL7998X_REG((pg), 0xd9) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_BM_L(pg) ISL7998X_REG((pg), 0xda) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_BM_H(pg) ISL7998X_REG((pg), 0xdb) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_BM_L(pg) ISL7998X_REG((pg), 0xdc) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_RM_H(pg) ISL7998X_REG((pg), 0xdd) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_X_RM_L(pg) ISL7998X_REG((pg), 0xde) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_RM_H(pg) ISL7998X_REG((pg), 0xdf) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_RM_L(pg) ISL7998X_REG((pg), 0xe0) +#define ISL7998X_REG_PX_ACA_HIST_DATA_LO(pg) ISL7998X_REG((pg), 0xe1) +#define ISL7998X_REG_PX_ACA_HIST_DATA_MID(pg) ISL7998X_REG((pg), 0xe2) +#define ISL7998X_REG_PX_ACA_HIST_DATA_HI(pg) ISL7998X_REG((pg), 0xe3) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_Y_CLR(pg) ISL7998X_REG((pg), 0xe4) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_CB_CLR(pg) ISL7998X_REG((pg), 0xe5) +#define ISL7998X_REG_PX_ACA_FLEX_WIN_CR_CLR(pg) ISL7998X_REG((pg), 0xe6) +#define ISL7998X_REG_PX_ACA_XFER_HIST_HOST(pg) ISL7998X_REG((pg), 0xe7) + +#define ISL7998X_REG_P5_LI_ENGINE_CTL ISL7998X_REG(5, 0x00) +#define ISL7998X_REG_P5_LI_ENGINE_LINE_CTL ISL7998X_REG(5, 0x01) +#define ISL7998X_REG_P5_LI_ENGINE_PIC_WIDTH ISL7998X_REG(5, 0x02) +#define ISL7998X_REG_P5_LI_ENGINE_SYNC_CTL ISL7998X_REG(5, 0x03) +#define ISL7998X_REG_P5_LI_ENGINE_VC_ASSIGNMENT ISL7998X_REG(5, 0x04) +#define ISL7998X_REG_P5_LI_ENGINE_TYPE_CTL ISL7998X_REG(5, 0x05) +#define ISL7998X_REG_P5_LI_ENGINE_FIFO_CTL ISL7998X_REG(5, 0x06) +#define ISL7998X_REG_P5_MIPI_READ_START_CTL ISL7998X_REG(5, 0x07) +#define ISL7998X_REG_P5_PSEUDO_FRM_FIELD_CTL ISL7998X_REG(5, 0x08) +#define ISL7998X_REG_P5_ONE_FIELD_MODE_CTL ISL7998X_REG(5, 0x09) +#define ISL7998X_REG_P5_MIPI_INT_HW_TST_CTR ISL7998X_REG(5, 0x0a) +#define ISL7998X_REG_P5_TP_GEN_BAR_PATTERN ISL7998X_REG(5, 0x0b) +#define ISL7998X_REG_P5_MIPI_PCNT_PSFRM ISL7998X_REG(5, 0x0c) +#define ISL7998X_REG_P5_LI_ENGINE_TP_GEN_CTL 
ISL7998X_REG(5, 0x0d) +#define ISL7998X_REG_P5_MIPI_VBLANK_PSFRM ISL7998X_REG(5, 0x0e) +#define ISL7998X_REG_P5_LI_ENGINE_CTL_2 ISL7998X_REG(5, 0x0f) +#define ISL7998X_REG_P5_MIPI_WCNT_1 ISL7998X_REG(5, 0x10) +#define ISL7998X_REG_P5_MIPI_WCNT_2 ISL7998X_REG(5, 0x11) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_1 ISL7998X_REG(5, 0x12) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_2 ISL7998X_REG(5, 0x13) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_3 ISL7998X_REG(5, 0x14) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_4 ISL7998X_REG(5, 0x15) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_5 ISL7998X_REG(5, 0x16) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_6 ISL7998X_REG(5, 0x17) +#define ISL7998X_REG_P5_MIPI_DPHY_PARAMS_1 ISL7998X_REG(5, 0x18) +#define ISL7998X_REG_P5_MIPI_DPHY_SOT_PERIOD ISL7998X_REG(5, 0x19) +#define ISL7998X_REG_P5_MIPI_DPHY_EOT_PERIOD ISL7998X_REG(5, 0x1a) +#define ISL7998X_REG_P5_MIPI_DPHY_PARAMS_2 ISL7998X_REG(5, 0x1b) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_7 ISL7998X_REG(5, 0x1c) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_8 ISL7998X_REG(5, 0x1d) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_9 ISL7998X_REG(5, 0x1e) +#define ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_10 ISL7998X_REG(5, 0x1f) +#define ISL7998X_REG_P5_TP_GEN_MIPI ISL7998X_REG(5, 0x20) +#define ISL7998X_REG_P5_ESC_MODE_TIME_CTL ISL7998X_REG(5, 0x21) +#define ISL7998X_REG_P5_AUTO_TEST_ERR_DET ISL7998X_REG(5, 0x22) +#define ISL7998X_REG_P5_MIPI_TIMING ISL7998X_REG(5, 0x23) +#define ISL7998X_REG_P5_PIC_HEIGHT_HIGH ISL7998X_REG(5, 0x24) +#define ISL7998X_REG_P5_PIC_HEIGHT_LOW ISL7998X_REG(5, 0x25) +#define ISL7998X_REG_P5_MIPI_SP_HS_TRL_CTL ISL7998X_REG(5, 0x26) +#define ISL7998X_REG_P5_FIFO_THRSH_CNT_1 ISL7998X_REG(5, 0x28) +#define ISL7998X_REG_P5_FIFO_THRSH_CNT_2 ISL7998X_REG(5, 0x29) +#define ISL7998X_REG_P5_TP_GEN_RND_SYNC_CTL_1 ISL7998X_REG(5, 0x2a) +#define ISL7998X_REG_P5_TP_GEN_RND_SYNC_CTL_2 ISL7998X_REG(5, 0x2b) +#define ISL7998X_REG_P5_PSF_FIELD_END_CTL_1 ISL7998X_REG(5, 0x2c) +#define ISL7998X_REG_P5_PSF_FIELD_END_CTL_2 ISL7998X_REG(5, 0x2d) +#define ISL7998X_REG_P5_PSF_FIELD_END_CTL_3 ISL7998X_REG(5, 0x2e) +#define ISL7998X_REG_P5_PSF_FIELD_END_CTL_4 ISL7998X_REG(5, 0x2f) +#define ISL7998X_REG_P5_MIPI_ANA_DATA_CTL_1 ISL7998X_REG(5, 0x30) +#define ISL7998X_REG_P5_MIPI_ANA_DATA_CTL_2 ISL7998X_REG(5, 0x31) +#define ISL7998X_REG_P5_MIPI_ANA_CLK_CTL ISL7998X_REG(5, 0x32) +#define ISL7998X_REG_P5_PLL_ANA_STATUS ISL7998X_REG(5, 0x33) +#define ISL7998X_REG_P5_PLL_ANA_MISC_CTL ISL7998X_REG(5, 0x34) +#define ISL7998X_REG_P5_MIPI_ANA ISL7998X_REG(5, 0x35) +#define ISL7998X_REG_P5_PLL_ANA ISL7998X_REG(5, 0x36) +#define ISL7998X_REG_P5_TOTAL_PF_LINE_CNT_1 ISL7998X_REG(5, 0x38) +#define ISL7998X_REG_P5_TOTAL_PF_LINE_CNT_2 ISL7998X_REG(5, 0x39) +#define ISL7998X_REG_P5_H_LINE_CNT_1 ISL7998X_REG(5, 0x3a) +#define ISL7998X_REG_P5_H_LINE_CNT_2 ISL7998X_REG(5, 0x3b) +#define ISL7998X_REG_P5_HIST_LINE_CNT_1 ISL7998X_REG(5, 0x3c) +#define ISL7998X_REG_P5_HIST_LINE_CNT_2 ISL7998X_REG(5, 0x3d) + +static const struct reg_sequence isl7998x_init_seq_1[] = { + { ISL7998X_REG_P0_SHORT_DIAG_IRQ_EN, 0xff }, + { ISL7998X_REG_PX_DEC_SDT(0x1), 0x00 }, + { ISL7998X_REG_PX_DEC_SHORT_DET_CTL_1(0x1), 0x03 }, + { ISL7998X_REG_PX_DEC_SDT(0x2), 0x00 }, + { ISL7998X_REG_PX_DEC_SHORT_DET_CTL_1(0x2), 0x03 }, + { ISL7998X_REG_PX_DEC_SDT(0x3), 0x00 }, + { ISL7998X_REG_PX_DEC_SHORT_DET_CTL_1(0x3), 0x03 }, + { ISL7998X_REG_PX_DEC_SDT(0x4), 0x00 }, + { ISL7998X_REG_PX_DEC_SHORT_DET_CTL_1(0x4), 0x03 }, + { ISL7998X_REG_P5_LI_ENGINE_CTL, 
0x00 }, + { ISL7998X_REG_P0_SW_RESET_CTL, 0x1f, 10 }, + { ISL7998X_REG_P0_IO_BUFFER_CTL, 0x00 }, + { ISL7998X_REG_P0_MPP2_SYNC_CTL, 0xc9 }, + { ISL7998X_REG_P0_IRQ_SYNC_CTL, 0xc9 }, + { ISL7998X_REG_P0_CHAN_1_IRQ, 0x03 }, + { ISL7998X_REG_P0_CHAN_2_IRQ, 0x00 }, + { ISL7998X_REG_P0_CHAN_3_IRQ, 0x00 }, + { ISL7998X_REG_P0_CHAN_4_IRQ, 0x00 }, + { ISL7998X_REG_P5_LI_ENGINE_CTL, 0x02 }, + { ISL7998X_REG_P5_LI_ENGINE_LINE_CTL, 0x85 }, + { ISL7998X_REG_P5_LI_ENGINE_PIC_WIDTH, 0xa0 }, + { ISL7998X_REG_P5_LI_ENGINE_SYNC_CTL, 0x18 }, + { ISL7998X_REG_P5_LI_ENGINE_TYPE_CTL, 0x40 }, + { ISL7998X_REG_P5_LI_ENGINE_FIFO_CTL, 0x40 }, + { ISL7998X_REG_P5_MIPI_WCNT_1, 0x05 }, + { ISL7998X_REG_P5_MIPI_WCNT_2, 0xa0 }, + { ISL7998X_REG_P5_TP_GEN_MIPI, 0x00 }, + { ISL7998X_REG_P5_ESC_MODE_TIME_CTL, 0x0c }, + { ISL7998X_REG_P5_MIPI_SP_HS_TRL_CTL, 0x00 }, + { ISL7998X_REG_P5_TP_GEN_RND_SYNC_CTL_1, 0x00 }, + { ISL7998X_REG_P5_TP_GEN_RND_SYNC_CTL_2, 0x19 }, + { ISL7998X_REG_P5_PSF_FIELD_END_CTL_1, 0x18 }, + { ISL7998X_REG_P5_PSF_FIELD_END_CTL_2, 0xf1 }, + { ISL7998X_REG_P5_PSF_FIELD_END_CTL_3, 0x00 }, + { ISL7998X_REG_P5_PSF_FIELD_END_CTL_4, 0xf1 }, + { ISL7998X_REG_P5_MIPI_ANA_DATA_CTL_1, 0x00 }, + { ISL7998X_REG_P5_MIPI_ANA_DATA_CTL_2, 0x00 }, + { ISL7998X_REG_P5_MIPI_ANA_CLK_CTL, 0x00 }, + { ISL7998X_REG_P5_PLL_ANA_STATUS, 0xc0 }, + { ISL7998X_REG_P5_PLL_ANA_MISC_CTL, 0x18 }, + { ISL7998X_REG_P5_PLL_ANA, 0x00 }, + { ISL7998X_REG_P0_SW_RESET_CTL, 0x10, 10 }, + /* Page 0xf means write to all of pages 1,2,3,4 */ + { ISL7998X_REG_PX_DEC_VDELAY_LO(0xf), 0x14 }, + { ISL7998X_REG_PX_DEC_MISC3(0xf), 0xe6 }, + { ISL7998X_REG_PX_DEC_CLMD(0xf), 0x85 }, + { ISL7998X_REG_PX_DEC_H_DELAY_II_LOW(0xf), 0x11 }, + { ISL7998X_REG_PX_ACA_XFER_HIST_HOST(0xf), 0x00 }, + { ISL7998X_REG_P0_CLK_CTL_1, 0x1f }, + { ISL7998X_REG_P0_CLK_CTL_2, 0x43 }, + { ISL7998X_REG_P0_CLK_CTL_3, 0x4f }, +}; + +static const struct reg_sequence isl7998x_init_seq_2[] = { + { ISL7998X_REG_P5_LI_ENGINE_SYNC_CTL, 0x10 }, + { ISL7998X_REG_P5_LI_ENGINE_VC_ASSIGNMENT, 0xe4 }, + { ISL7998X_REG_P5_LI_ENGINE_TYPE_CTL, 0x00 }, + { ISL7998X_REG_P5_LI_ENGINE_FIFO_CTL, 0x60 }, + { ISL7998X_REG_P5_MIPI_READ_START_CTL, 0x2b }, + { ISL7998X_REG_P5_PSEUDO_FRM_FIELD_CTL, 0x02 }, + { ISL7998X_REG_P5_ONE_FIELD_MODE_CTL, 0x00 }, + { ISL7998X_REG_P5_MIPI_INT_HW_TST_CTR, 0x62 }, + { ISL7998X_REG_P5_TP_GEN_BAR_PATTERN, 0x02 }, + { ISL7998X_REG_P5_MIPI_PCNT_PSFRM, 0x36 }, + { ISL7998X_REG_P5_LI_ENGINE_TP_GEN_CTL, 0x00 }, + { ISL7998X_REG_P5_MIPI_VBLANK_PSFRM, 0x6c }, + { ISL7998X_REG_P5_LI_ENGINE_CTL_2, 0x00 }, + { ISL7998X_REG_P5_MIPI_WCNT_1, 0x05 }, + { ISL7998X_REG_P5_MIPI_WCNT_2, 0xa0 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_1, 0x77 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_2, 0x17 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_3, 0x08 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_4, 0x38 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_5, 0x14 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_6, 0xf6 }, + { ISL7998X_REG_P5_MIPI_DPHY_PARAMS_1, 0x00 }, + { ISL7998X_REG_P5_MIPI_DPHY_SOT_PERIOD, 0x17 }, + { ISL7998X_REG_P5_MIPI_DPHY_EOT_PERIOD, 0x0a }, + { ISL7998X_REG_P5_MIPI_DPHY_PARAMS_2, 0x71 }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_7, 0x7a }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_8, 0x0f }, + { ISL7998X_REG_P5_MIPI_DPHY_TIMING_CTL_9, 0x8c }, + { ISL7998X_REG_P5_MIPI_SP_HS_TRL_CTL, 0x08 }, + { ISL7998X_REG_P5_FIFO_THRSH_CNT_1, 0x01 }, + { ISL7998X_REG_P5_FIFO_THRSH_CNT_2, 0x0e }, + { ISL7998X_REG_P5_TP_GEN_RND_SYNC_CTL_1, 0x00 }, + { ISL7998X_REG_P5_TP_GEN_RND_SYNC_CTL_2, 0x00 }, + 
{ ISL7998X_REG_P5_TOTAL_PF_LINE_CNT_1, 0x03 }, + { ISL7998X_REG_P5_TOTAL_PF_LINE_CNT_2, 0xc0 }, + { ISL7998X_REG_P5_H_LINE_CNT_1, 0x06 }, + { ISL7998X_REG_P5_H_LINE_CNT_2, 0xb3 }, + { ISL7998X_REG_P5_HIST_LINE_CNT_1, 0x00 }, + { ISL7998X_REG_P5_HIST_LINE_CNT_2, 0xf1 }, + { ISL7998X_REG_P5_LI_ENGINE_FIFO_CTL, 0x00 }, + { ISL7998X_REG_P5_MIPI_ANA, 0x00 }, + /* + * Wait a bit after reset so that the chip can capture a frame + * and update internal line counters. + */ + { ISL7998X_REG_P0_SW_RESET_CTL, 0x00, 50 }, +}; + +enum isl7998x_pads { + ISL7998X_PAD_OUT, + ISL7998X_PAD_VIN1, + ISL7998X_PAD_VIN2, + ISL7998X_PAD_VIN3, + ISL7998X_PAD_VIN4, + ISL7998X_NUM_PADS +}; + +struct isl7998x_datafmt { + u32 code; + enum v4l2_colorspace colorspace; +}; + +static const struct isl7998x_datafmt isl7998x_colour_fmts[] = { + { MEDIA_BUS_FMT_UYVY8_2X8, V4L2_COLORSPACE_SRGB }, +}; + +/* Menu items for LINK_FREQ V4L2 control */ +static const s64 link_freq_menu_items[] = { + /* 1 channel, 1 lane or 2 channels, 2 lanes */ + 108000000, + /* 2 channels, 1 lane or 4 channels, 2 lanes */ + 216000000, + /* 4 channels, 1 lane */ + 432000000, +}; + +/* Menu items for TEST_PATTERN V4L2 control */ +static const char * const isl7998x_test_pattern_menu[] = { + "Disabled", + "Enabled", +}; + +static const char * const isl7998x_test_pattern_bars[] = { + "bbbbwb", "bbbwwb", "bbwbwb", "bbwwwb", +}; + +static const char * const isl7998x_test_pattern_colors[] = { + "Yellow", "Blue", "Green", "Pink", +}; + +struct isl7998x_mode { + unsigned int width; + unsigned int height; + enum v4l2_field field; +}; + +static const struct isl7998x_mode supported_modes[] = { + { + .width = 720, + .height = 576, + .field = V4L2_FIELD_SEQ_TB, + }, + { + .width = 720, + .height = 480, + .field = V4L2_FIELD_SEQ_BT, + }, +}; + +static const struct isl7998x_video_std { + const v4l2_std_id norm; + unsigned int id; + const struct isl7998x_mode *mode; +} isl7998x_std_res[] = { + { V4L2_STD_NTSC_443, + ISL7998X_REG_PX_DEC_SDT_STANDARD_NTSC_443, + &supported_modes[1] }, + { V4L2_STD_PAL_M, + ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL_M, + &supported_modes[1] }, + { V4L2_STD_PAL_Nc, + ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL_CN, + &supported_modes[0] }, + { V4L2_STD_PAL_N, + ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL, + &supported_modes[0] }, + { V4L2_STD_PAL_60, + ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL_60, + &supported_modes[1] }, + { V4L2_STD_NTSC, + ISL7998X_REG_PX_DEC_SDT_STANDARD_NTSC_M, + &supported_modes[1] }, + { V4L2_STD_PAL, + ISL7998X_REG_PX_DEC_SDT_STANDARD_PAL, + &supported_modes[0] }, + { V4L2_STD_SECAM, + ISL7998X_REG_PX_DEC_SDT_STANDARD_SECAM, + &supported_modes[0] }, + { V4L2_STD_UNKNOWN, + ISL7998X_REG_PX_DEC_SDT_STANDARD_UNKNOWN, + &supported_modes[1] }, +}; + +struct isl7998x { + struct v4l2_subdev subdev; + struct regmap *regmap; + struct gpio_desc *pd_gpio; + struct gpio_desc *rstb_gpio; + unsigned int nr_mipi_lanes; + u32 nr_inputs; + + const struct isl7998x_datafmt *fmt; + v4l2_std_id norm; + struct media_pad pads[ISL7998X_NUM_PADS]; + + int enabled; + + /* protect fmt, norm, enabled */ + struct mutex lock; + + struct v4l2_ctrl_handler ctrl_handler; + /* protect ctrl_handler */ + struct mutex ctrl_mutex; + + /* V4L2 Controls */ + struct v4l2_ctrl *link_freq; + u8 test_pattern; + u8 test_pattern_bars; + u8 test_pattern_chans; + u8 test_pattern_color; +}; + +static struct isl7998x *sd_to_isl7998x(struct v4l2_subdev *sd) +{ + return container_of(sd, struct isl7998x, subdev); +} + +static struct isl7998x *i2c_to_isl7998x(const struct i2c_client *client) 
+{ + return sd_to_isl7998x(i2c_get_clientdata(client)); +} + +static unsigned int isl7998x_norm_to_val(v4l2_std_id norm) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(isl7998x_std_res); i++) + if (isl7998x_std_res[i].norm & norm) + break; + if (i == ARRAY_SIZE(isl7998x_std_res)) + return ISL7998X_REG_PX_DEC_SDT_STANDARD_UNKNOWN; + + return isl7998x_std_res[i].id; +} + +static const struct isl7998x_mode *isl7998x_norm_to_mode(v4l2_std_id norm) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(isl7998x_std_res); i++) + if (isl7998x_std_res[i].norm & norm) + break; + /* Use NTSC default resolution during standard detection */ + if (i == ARRAY_SIZE(isl7998x_std_res)) + return &supported_modes[1]; + + return isl7998x_std_res[i].mode; +} + +static int isl7998x_get_nr_inputs(struct device_node *of_node) +{ + struct device_node *port; + unsigned int inputs = 0; + unsigned int i; + + if (of_graph_get_endpoint_count(of_node) > ISL7998X_NUM_PADS) + return -EINVAL; + + /* + * The driver does not provide means to remap the input ports. It + * always configures input ports to start from VID1. Ensure that the + * device tree is correct. + */ + for (i = ISL7998X_PAD_VIN1; i <= ISL7998X_PAD_VIN4; i++) { + port = of_graph_get_port_by_id(of_node, i); + if (!port) + continue; + + inputs |= BIT(i); + of_node_put(port); + } + + switch (inputs) { + case BIT(ISL7998X_PAD_VIN1): + return 1; + case BIT(ISL7998X_PAD_VIN1) | BIT(ISL7998X_PAD_VIN2): + return 2; + case BIT(ISL7998X_PAD_VIN1) | BIT(ISL7998X_PAD_VIN2) | + BIT(ISL7998X_PAD_VIN3) | BIT(ISL7998X_PAD_VIN4): + return 4; + default: + return -EINVAL; + } +} + +static int isl7998x_wait_power_on(struct isl7998x *isl7998x) +{ + struct device *dev = isl7998x->subdev.dev; + u32 chip_id; + int ret; + int err; + + ret = read_poll_timeout(regmap_read, err, !err, 2000, 20000, false, + isl7998x->regmap, + ISL7998X_REG_P0_PRODUCT_ID_CODE, &chip_id); + if (ret) { + dev_err(dev, "timeout while waiting for ISL7998X\n"); + return ret; + } + + dev_dbg(dev, "Found ISL799%x\n", chip_id); + + return ret; +} + +static int isl7998x_set_standard(struct isl7998x *isl7998x, v4l2_std_id norm) +{ + const struct isl7998x_mode *mode = isl7998x_norm_to_mode(norm); + unsigned int val = isl7998x_norm_to_val(norm); + unsigned int width = mode->width; + unsigned int i; + int ret; + + for (i = 0; i < ISL7998X_INPUTS; i++) { + ret = regmap_write_bits(isl7998x->regmap, + ISL7998X_REG_PX_DEC_SDT(i + 1), + ISL7998X_REG_PX_DEC_SDT_STANDARD, + val); + if (ret) + return ret; + } + + ret = regmap_write(isl7998x->regmap, + ISL7998X_REG_P5_LI_ENGINE_LINE_CTL, + 0x20 | ((width >> 7) & 0x1f)); + if (ret) + return ret; + + ret = regmap_write(isl7998x->regmap, + ISL7998X_REG_P5_LI_ENGINE_PIC_WIDTH, + (width << 1) & 0xff); + if (ret) + return ret; + + return 0; +} + +static int isl7998x_init(struct isl7998x *isl7998x) +{ + const unsigned int lanes = isl7998x->nr_mipi_lanes; + const u32 isl7998x_video_in_chan_map[] = { 0x00, 0x11, 0x02, 0x02 }; + const struct reg_sequence isl7998x_init_seq_custom[] = { + { ISL7998X_REG_P0_VIDEO_IN_CHAN_CTL, + isl7998x_video_in_chan_map[isl7998x->nr_inputs - 1] }, + { ISL7998X_REG_P0_CLK_CTL_4, + (lanes == 1) ? 0x40 : 0x41 }, + { ISL7998X_REG_P5_LI_ENGINE_CTL, + (lanes == 1) ? 
0x01 : 0x02 }, + }; + struct device *dev = isl7998x->subdev.dev; + struct regmap *regmap = isl7998x->regmap; + int ret; + + dev_dbg(dev, "configuring %d lanes for %d inputs (norm %s)\n", + isl7998x->nr_mipi_lanes, isl7998x->nr_inputs, + v4l2_norm_to_name(isl7998x->norm)); + + ret = regmap_register_patch(regmap, isl7998x_init_seq_1, + ARRAY_SIZE(isl7998x_init_seq_1)); + if (ret) + return ret; + + mutex_lock(&isl7998x->lock); + ret = isl7998x_set_standard(isl7998x, isl7998x->norm); + mutex_unlock(&isl7998x->lock); + if (ret) + return ret; + + ret = regmap_register_patch(regmap, isl7998x_init_seq_custom, + ARRAY_SIZE(isl7998x_init_seq_custom)); + if (ret) + return ret; + + return regmap_register_patch(regmap, isl7998x_init_seq_2, + ARRAY_SIZE(isl7998x_init_seq_2)); +} + +static int isl7998x_set_test_pattern(struct isl7998x *isl7998x) +{ + const struct reg_sequence isl7998x_init_seq_tpg_off[] = { + { ISL7998X_REG_P5_LI_ENGINE_TP_GEN_CTL, 0 }, + { ISL7998X_REG_P5_LI_ENGINE_CTL_2, 0 } + }; + const struct reg_sequence isl7998x_init_seq_tpg_on[] = { + { ISL7998X_REG_P5_TP_GEN_BAR_PATTERN, + isl7998x->test_pattern_bars << 6 }, + { ISL7998X_REG_P5_LI_ENGINE_CTL_2, + isl7998x->norm & V4L2_STD_PAL ? BIT(2) : 0 }, + { ISL7998X_REG_P5_LI_ENGINE_TP_GEN_CTL, + (isl7998x->test_pattern_chans << 4) | + (isl7998x->test_pattern_color << 2) } + }; + struct device *dev = isl7998x->subdev.dev; + struct regmap *regmap = isl7998x->regmap; + int ret; + + if (pm_runtime_get_if_in_use(dev) <= 0) + return 0; + + if (isl7998x->test_pattern != 0) { + dev_dbg(dev, "enabling test pattern: channels 0x%x, %s, %s\n", + isl7998x->test_pattern_chans, + isl7998x_test_pattern_bars[isl7998x->test_pattern_bars], + isl7998x_test_pattern_colors[isl7998x->test_pattern_color]); + ret = regmap_register_patch(regmap, isl7998x_init_seq_tpg_on, + ARRAY_SIZE(isl7998x_init_seq_tpg_on)); + } else { + ret = regmap_register_patch(regmap, isl7998x_init_seq_tpg_off, + ARRAY_SIZE(isl7998x_init_seq_tpg_off)); + } + + pm_runtime_put(dev); + + return ret; +} + +#ifdef CONFIG_VIDEO_ADV_DEBUG +static int isl7998x_g_register(struct v4l2_subdev *sd, + struct v4l2_dbg_register *reg) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + int ret; + u32 val; + + ret = regmap_read(isl7998x->regmap, reg->reg, &val); + if (ret) + return ret; + + reg->size = 1; + reg->val = val; + + return 0; +} + +static int isl7998x_s_register(struct v4l2_subdev *sd, + const struct v4l2_dbg_register *reg) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + + return regmap_write(isl7998x->regmap, reg->reg, reg->val); +} +#endif + +static int isl7998x_g_std(struct v4l2_subdev *sd, v4l2_std_id *norm) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + + mutex_lock(&isl7998x->lock); + *norm = isl7998x->norm; + mutex_unlock(&isl7998x->lock); + + return 0; +} + +static int isl7998x_s_std(struct v4l2_subdev *sd, v4l2_std_id norm) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + int ret = 0; + + mutex_lock(&isl7998x->lock); + if (isl7998x->enabled) { + ret = -EBUSY; + mutex_unlock(&isl7998x->lock); + return ret; + } + isl7998x->norm = norm; + mutex_unlock(&isl7998x->lock); + + if (pm_runtime_get_if_in_use(dev) <= 0) + return ret; + + ret = isl7998x_set_standard(isl7998x, norm); + + pm_runtime_put(dev); + + return ret; +} + +static int isl7998x_querystd(struct v4l2_subdev *sd, v4l2_std_id *std) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + struct i2c_client *client = 
v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + unsigned int std_id[ISL7998X_INPUTS]; + unsigned int i; + int ret; + u32 reg; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return ret; + + dev_dbg(dev, "starting video standard detection\n"); + + mutex_lock(&isl7998x->lock); + if (isl7998x->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + ret = isl7998x_set_standard(isl7998x, V4L2_STD_UNKNOWN); + if (ret) + goto out_unlock; + + for (i = 0; i < ISL7998X_INPUTS; i++) { + ret = regmap_write(isl7998x->regmap, + ISL7998X_REG_PX_DEC_SDTR(i + 1), + ISL7998X_REG_PX_DEC_SDTR_ATSTART); + if (ret) + goto out_reset_std; + } + + for (i = 0; i < ISL7998X_INPUTS; i++) { + ret = regmap_read_poll_timeout(isl7998x->regmap, + ISL7998X_REG_PX_DEC_SDT(i + 1), + reg, + !(reg & ISL7998X_REG_PX_DEC_SDT_DET), + 2000, 500 * USEC_PER_MSEC); + if (ret) + goto out_reset_std; + std_id[i] = FIELD_GET(ISL7998X_REG_PX_DEC_SDT_NOW, reg); + } + + /* + * According to Renesas FAE, all input cameras must have the + * same standard on this chip. + */ + for (i = 0; i < isl7998x->nr_inputs; i++) { + dev_dbg(dev, "input %d: detected %s\n", + i, v4l2_norm_to_name(isl7998x_std_res[std_id[i]].norm)); + if (std_id[0] != std_id[i]) + dev_warn(dev, + "incompatible standards: %s on input %d (expected %s)\n", + v4l2_norm_to_name(isl7998x_std_res[std_id[i]].norm), i, + v4l2_norm_to_name(isl7998x_std_res[std_id[0]].norm)); + } + + *std = isl7998x_std_res[std_id[0]].norm; + +out_reset_std: + isl7998x_set_standard(isl7998x, isl7998x->norm); +out_unlock: + mutex_unlock(&isl7998x->lock); + pm_runtime_put(dev); + + return ret; +} + +static int isl7998x_g_tvnorms(struct v4l2_subdev *sd, v4l2_std_id *std) +{ + *std = V4L2_STD_ALL; + + return 0; +} + +static int isl7998x_g_input_status(struct v4l2_subdev *sd, u32 *status) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + unsigned int i; + int ret = 0; + u32 reg; + + if (!pm_runtime_active(dev)) { + *status |= V4L2_IN_ST_NO_POWER; + return 0; + } + + for (i = 0; i < isl7998x->nr_inputs; i++) { + ret = regmap_read(isl7998x->regmap, + ISL7998X_REG_PX_DEC_STATUS_1(i + 1), ®); + if (!ret) { + if (reg & ISL7998X_REG_PX_DEC_STATUS_1_VDLOSS) + *status |= V4L2_IN_ST_NO_SIGNAL; + if (!(reg & ISL7998X_REG_PX_DEC_STATUS_1_HLOCK)) + *status |= V4L2_IN_ST_NO_H_LOCK; + if (!(reg & ISL7998X_REG_PX_DEC_STATUS_1_VLOCK)) + *status |= V4L2_IN_ST_NO_V_LOCK; + } + } + + return ret; +} + +static int isl7998x_s_stream(struct v4l2_subdev *sd, int enable) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + int ret = 0; + u32 reg; + + dev_dbg(dev, "stream %s\n", enable ? 
"ON" : "OFF"); + + mutex_lock(&isl7998x->lock); + if (isl7998x->enabled == enable) + goto out; + isl7998x->enabled = enable; + + if (enable) { + ret = isl7998x_set_test_pattern(isl7998x); + if (ret) + goto out; + } + + regmap_read(isl7998x->regmap, + ISL7998X_REG_P5_LI_ENGINE_CTL, ®); + if (enable) + reg &= ~BIT(7); + else + reg |= BIT(7); + ret = regmap_write(isl7998x->regmap, + ISL7998X_REG_P5_LI_ENGINE_CTL, reg); + +out: + mutex_unlock(&isl7998x->lock); + + return ret; +} + +static int isl7998x_pre_streamon(struct v4l2_subdev *sd, u32 flags) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + + return pm_runtime_resume_and_get(dev); +} + +static int isl7998x_post_streamoff(struct v4l2_subdev *sd) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + struct device *dev = &client->dev; + + pm_runtime_put(dev); + + return 0; +} + +static int isl7998x_enum_mbus_code(struct v4l2_subdev *sd, + struct v4l2_subdev_state *sd_state, + struct v4l2_subdev_mbus_code_enum *code) +{ + if (code->index >= ARRAY_SIZE(isl7998x_colour_fmts)) + return -EINVAL; + + code->code = isl7998x_colour_fmts[code->index].code; + + return 0; +} + +static int isl7998x_enum_frame_size(struct v4l2_subdev *sd, + struct v4l2_subdev_state *sd_state, + struct v4l2_subdev_frame_size_enum *fse) +{ + if (fse->index >= ARRAY_SIZE(supported_modes)) + return -EINVAL; + + if (fse->code != isl7998x_colour_fmts[0].code) + return -EINVAL; + + fse->min_width = supported_modes[fse->index].width; + fse->max_width = fse->min_width; + fse->min_height = supported_modes[fse->index].height; + fse->max_height = fse->min_height; + + return 0; +} + +static int isl7998x_get_fmt(struct v4l2_subdev *sd, + struct v4l2_subdev_state *sd_state, + struct v4l2_subdev_format *format) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + struct v4l2_mbus_framefmt *mf = &format->format; + const struct isl7998x_mode *mode; + + mutex_lock(&isl7998x->lock); + + if (format->which == V4L2_SUBDEV_FORMAT_TRY) { + format->format = *v4l2_subdev_get_try_format(sd, sd_state, + format->pad); + goto out; + } + + mode = isl7998x_norm_to_mode(isl7998x->norm); + + mf->width = mode->width; + mf->height = mode->height; + mf->code = isl7998x->fmt->code; + mf->field = mode->field; + mf->colorspace = 0; + +out: + mutex_unlock(&isl7998x->lock); + + return 0; +} + +static int isl7998x_set_fmt(struct v4l2_subdev *sd, + struct v4l2_subdev_state *sd_state, + struct v4l2_subdev_format *format) +{ + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + struct v4l2_mbus_framefmt *mf = &format->format; + const struct isl7998x_mode *mode; + + mutex_lock(&isl7998x->lock); + + mode = isl7998x_norm_to_mode(isl7998x->norm); + + mf->width = mode->width; + mf->height = mode->height; + mf->code = isl7998x->fmt->code; + mf->field = mode->field; + + if (format->which == V4L2_SUBDEV_FORMAT_TRY) + *v4l2_subdev_get_try_format(sd, sd_state, format->pad) = format->format; + + mutex_unlock(&isl7998x->lock); + + return 0; +} + +static int isl7998x_set_ctrl(struct v4l2_ctrl *ctrl) +{ + struct isl7998x *isl7998x = container_of(ctrl->handler, + struct isl7998x, ctrl_handler); + int ret = 0; + + switch (ctrl->id) { + case V4L2_CID_TEST_PATTERN_BARS: + mutex_lock(&isl7998x->lock); + isl7998x->test_pattern_bars = ctrl->val & 0x3; + ret = isl7998x_set_test_pattern(isl7998x); + mutex_unlock(&isl7998x->lock); + break; + case V4L2_CID_TEST_PATTERN_CHANNELS: + mutex_lock(&isl7998x->lock); + isl7998x->test_pattern_chans = ctrl->val & 0xf; + ret = 
isl7998x_set_test_pattern(isl7998x); + mutex_unlock(&isl7998x->lock); + break; + case V4L2_CID_TEST_PATTERN_COLOR: + mutex_lock(&isl7998x->lock); + isl7998x->test_pattern_color = ctrl->val & 0x3; + ret = isl7998x_set_test_pattern(isl7998x); + mutex_unlock(&isl7998x->lock); + break; + case V4L2_CID_TEST_PATTERN: + mutex_lock(&isl7998x->lock); + isl7998x->test_pattern = ctrl->val; + ret = isl7998x_set_test_pattern(isl7998x); + mutex_unlock(&isl7998x->lock); + break; + } + + return ret; +} + +static const struct v4l2_subdev_core_ops isl7998x_subdev_core_ops = { +#ifdef CONFIG_VIDEO_ADV_DEBUG + .g_register = isl7998x_g_register, + .s_register = isl7998x_s_register, +#endif +}; + +static const struct v4l2_subdev_video_ops isl7998x_subdev_video_ops = { + .g_std = isl7998x_g_std, + .s_std = isl7998x_s_std, + .querystd = isl7998x_querystd, + .g_tvnorms = isl7998x_g_tvnorms, + .g_input_status = isl7998x_g_input_status, + .s_stream = isl7998x_s_stream, + .pre_streamon = isl7998x_pre_streamon, + .post_streamoff = isl7998x_post_streamoff, +}; + +static const struct v4l2_subdev_pad_ops isl7998x_subdev_pad_ops = { + .enum_mbus_code = isl7998x_enum_mbus_code, + .enum_frame_size = isl7998x_enum_frame_size, + .get_fmt = isl7998x_get_fmt, + .set_fmt = isl7998x_set_fmt, +}; + +static const struct v4l2_subdev_ops isl7998x_subdev_ops = { + .core = &isl7998x_subdev_core_ops, + .video = &isl7998x_subdev_video_ops, + .pad = &isl7998x_subdev_pad_ops, +}; + +static const struct media_entity_operations isl7998x_entity_ops = { + .link_validate = v4l2_subdev_link_validate, +}; + +static const struct v4l2_ctrl_ops isl7998x_ctrl_ops = { + .s_ctrl = isl7998x_set_ctrl, +}; + +static const struct v4l2_ctrl_config isl7998x_ctrls[] = { + { + .ops = &isl7998x_ctrl_ops, + .id = V4L2_CID_TEST_PATTERN_BARS, + .type = V4L2_CTRL_TYPE_MENU, + .name = "Test Pattern Bars", + .max = ARRAY_SIZE(isl7998x_test_pattern_bars) - 1, + .def = 0, + .qmenu = isl7998x_test_pattern_bars, + }, { + .ops = &isl7998x_ctrl_ops, + .id = V4L2_CID_TEST_PATTERN_CHANNELS, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Test Pattern Channels", + .min = 0, + .max = 0xf, + .step = 1, + .def = 0xf, + .flags = 0, + }, { + .ops = &isl7998x_ctrl_ops, + .id = V4L2_CID_TEST_PATTERN_COLOR, + .type = V4L2_CTRL_TYPE_MENU, + .name = "Test Pattern Color", + .max = ARRAY_SIZE(isl7998x_test_pattern_colors) - 1, + .def = 0, + .qmenu = isl7998x_test_pattern_colors, + }, +}; + +#define ISL7998X_REG_DECODER_ACA_READABLE_RANGE(page) \ + /* Decoder range */ \ + regmap_reg_range(ISL7998X_REG_PX_DEC_INPUT_FMT(page), \ + ISL7998X_REG_PX_DEC_HS_DELAY_CTL(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_ANCTL(page), \ + ISL7998X_REG_PX_DEC_CSC_CTL(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_BRIGHT(page), \ + ISL7998X_REG_PX_DEC_HUE(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_VERT_PEAK(page), \ + ISL7998X_REG_PX_DEC_CORING(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_SDT(page), \ + ISL7998X_REG_PX_DEC_SDTR(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_CLMPG(page), \ + ISL7998X_REG_PX_DEC_DATA_CONV(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_INTERNAL_TEST(page), \ + ISL7998X_REG_PX_DEC_INTERNAL_TEST(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_H_DELAY_CTL(page), \ + ISL7998X_REG_PX_DEC_H_DELAY_II_LOW(page)), \ + /* ACA range */ \ + regmap_reg_range(ISL7998X_REG_PX_ACA_CTL_1(page), \ + ISL7998X_REG_PX_ACA_HIST_WIN_V_SZ2(page)), \ + regmap_reg_range(ISL7998X_REG_PX_ACA_Y_AVG(page), \ + ISL7998X_REG_PX_ACA_CTL_4(page)), \ + 
regmap_reg_range(ISL7998X_REG_PX_ACA_FLEX_WIN_HIST(page), \ + ISL7998X_REG_PX_ACA_XFER_HIST_HOST(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_PAGE(page), \ + ISL7998X_REG_PX_DEC_PAGE(page)) + +#define ISL7998X_REG_DECODER_ACA_WRITEABLE_RANGE(page) \ + /* Decoder range */ \ + regmap_reg_range(ISL7998X_REG_PX_DEC_INPUT_FMT(page), \ + ISL7998X_REG_PX_DEC_INPUT_FMT(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_HS_DELAY_CTL(page), \ + ISL7998X_REG_PX_DEC_HS_DELAY_CTL(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_ANCTL(page), \ + ISL7998X_REG_PX_DEC_CSC_CTL(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_BRIGHT(page), \ + ISL7998X_REG_PX_DEC_HUE(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_VERT_PEAK(page), \ + ISL7998X_REG_PX_DEC_CORING(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_SDT(page), \ + ISL7998X_REG_PX_DEC_SDTR(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_CLMPG(page), \ + ISL7998X_REG_PX_DEC_MISC3(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_CLMD(page), \ + ISL7998X_REG_PX_DEC_DATA_CONV(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_INTERNAL_TEST(page), \ + ISL7998X_REG_PX_DEC_INTERNAL_TEST(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_H_DELAY_CTL(page), \ + ISL7998X_REG_PX_DEC_H_DELAY_II_LOW(page)), \ + /* ACA range */ \ + regmap_reg_range(ISL7998X_REG_PX_ACA_CTL_1(page), \ + ISL7998X_REG_PX_ACA_HIST_WIN_V_SZ2(page)), \ + regmap_reg_range(ISL7998X_REG_PX_ACA_CTL_2(page), \ + ISL7998X_REG_PX_ACA_CTL_4(page)), \ + regmap_reg_range(ISL7998X_REG_PX_ACA_FLEX_WIN_HIST(page), \ + ISL7998X_REG_PX_ACA_HIST_DATA_LO(page)), \ + regmap_reg_range(ISL7998X_REG_PX_ACA_XFER_HIST_HOST(page), \ + ISL7998X_REG_PX_ACA_XFER_HIST_HOST(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_PAGE(page), \ + ISL7998X_REG_PX_DEC_PAGE(page)) + +#define ISL7998X_REG_DECODER_ACA_VOLATILE_RANGE(page) \ + /* Decoder range */ \ + regmap_reg_range(ISL7998X_REG_PX_DEC_STATUS_1(page), \ + ISL7998X_REG_PX_DEC_STATUS_1(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_SDT(page), \ + ISL7998X_REG_PX_DEC_SDT(page)), \ + regmap_reg_range(ISL7998X_REG_PX_DEC_MVSN(page), \ + ISL7998X_REG_PX_DEC_HFREF(page)), \ + /* ACA range */ \ + regmap_reg_range(ISL7998X_REG_PX_ACA_Y_AVG(page), \ + ISL7998X_REG_PX_ACA_Y_HIGH(page)), \ + regmap_reg_range(ISL7998X_REG_PX_ACA_HIST_DATA_LO(page), \ + ISL7998X_REG_PX_ACA_FLEX_WIN_CR_CLR(page)) + +static const struct regmap_range isl7998x_readable_ranges[] = { + regmap_reg_range(ISL7998X_REG_P0_PRODUCT_ID_CODE, + ISL7998X_REG_P0_IRQ_SYNC_CTL), + regmap_reg_range(ISL7998X_REG_P0_INTERRUPT_STATUS, + ISL7998X_REG_P0_CLOCK_DELAY), + regmap_reg_range(ISL7998X_REG_PX_DEC_PAGE(0), + ISL7998X_REG_PX_DEC_PAGE(0)), + + ISL7998X_REG_DECODER_ACA_READABLE_RANGE(1), + ISL7998X_REG_DECODER_ACA_READABLE_RANGE(2), + ISL7998X_REG_DECODER_ACA_READABLE_RANGE(3), + ISL7998X_REG_DECODER_ACA_READABLE_RANGE(4), + + regmap_reg_range(ISL7998X_REG_P5_LI_ENGINE_CTL, + ISL7998X_REG_P5_MIPI_SP_HS_TRL_CTL), + regmap_reg_range(ISL7998X_REG_P5_FIFO_THRSH_CNT_1, + ISL7998X_REG_P5_PLL_ANA), + regmap_reg_range(ISL7998X_REG_P5_TOTAL_PF_LINE_CNT_1, + ISL7998X_REG_P5_HIST_LINE_CNT_2), + regmap_reg_range(ISL7998X_REG_PX_DEC_PAGE(5), + ISL7998X_REG_PX_DEC_PAGE(5)), +}; + +static const struct regmap_range isl7998x_writeable_ranges[] = { + regmap_reg_range(ISL7998X_REG_P0_SW_RESET_CTL, + ISL7998X_REG_P0_IRQ_SYNC_CTL), + regmap_reg_range(ISL7998X_REG_P0_CHAN_1_IRQ, + ISL7998X_REG_P0_SHORT_DIAG_IRQ_EN), + regmap_reg_range(ISL7998X_REG_P0_CLOCK_DELAY, + ISL7998X_REG_P0_CLOCK_DELAY), + 
regmap_reg_range(ISL7998X_REG_PX_DEC_PAGE(0), + ISL7998X_REG_PX_DEC_PAGE(0)), + + ISL7998X_REG_DECODER_ACA_WRITEABLE_RANGE(1), + ISL7998X_REG_DECODER_ACA_WRITEABLE_RANGE(2), + ISL7998X_REG_DECODER_ACA_WRITEABLE_RANGE(3), + ISL7998X_REG_DECODER_ACA_WRITEABLE_RANGE(4), + + regmap_reg_range(ISL7998X_REG_P5_LI_ENGINE_CTL, + ISL7998X_REG_P5_ESC_MODE_TIME_CTL), + regmap_reg_range(ISL7998X_REG_P5_MIPI_SP_HS_TRL_CTL, + ISL7998X_REG_P5_PLL_ANA), + regmap_reg_range(ISL7998X_REG_P5_TOTAL_PF_LINE_CNT_1, + ISL7998X_REG_P5_HIST_LINE_CNT_2), + regmap_reg_range(ISL7998X_REG_PX_DEC_PAGE(5), + ISL7998X_REG_PX_DEC_PAGE(5)), + + ISL7998X_REG_DECODER_ACA_WRITEABLE_RANGE(0xf), +}; + +static const struct regmap_range isl7998x_volatile_ranges[] = { + /* Product id code register is used to check availability */ + regmap_reg_range(ISL7998X_REG_P0_PRODUCT_ID_CODE, + ISL7998X_REG_P0_PRODUCT_ID_CODE), + regmap_reg_range(ISL7998X_REG_P0_MPP1_SYNC_CTL, + ISL7998X_REG_P0_IRQ_SYNC_CTL), + regmap_reg_range(ISL7998X_REG_P0_INTERRUPT_STATUS, + ISL7998X_REG_P0_INTERRUPT_STATUS), + regmap_reg_range(ISL7998X_REG_P0_CHAN_1_STATUS, + ISL7998X_REG_P0_SHORT_DIAG_STATUS), + + ISL7998X_REG_DECODER_ACA_VOLATILE_RANGE(1), + ISL7998X_REG_DECODER_ACA_VOLATILE_RANGE(2), + ISL7998X_REG_DECODER_ACA_VOLATILE_RANGE(3), + ISL7998X_REG_DECODER_ACA_VOLATILE_RANGE(4), + + regmap_reg_range(ISL7998X_REG_P5_AUTO_TEST_ERR_DET, + ISL7998X_REG_P5_PIC_HEIGHT_LOW), +}; + +static const struct regmap_access_table isl7998x_readable_table = { + .yes_ranges = isl7998x_readable_ranges, + .n_yes_ranges = ARRAY_SIZE(isl7998x_readable_ranges), +}; + +static const struct regmap_access_table isl7998x_writeable_table = { + .yes_ranges = isl7998x_writeable_ranges, + .n_yes_ranges = ARRAY_SIZE(isl7998x_writeable_ranges), +}; + +static const struct regmap_access_table isl7998x_volatile_table = { + .yes_ranges = isl7998x_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(isl7998x_volatile_ranges), +}; + +static const struct regmap_range_cfg isl7998x_ranges[] = { + { + .range_min = ISL7998X_REG_PN_BASE(0), + .range_max = ISL7998X_REG_PX_ACA_XFER_HIST_HOST(0xf), + .selector_reg = ISL7998X_REG_PX_DEC_PAGE(0), + .selector_mask = ISL7998X_REG_PX_DEC_PAGE_MASK, + .window_start = 0, + .window_len = 256, + } +}; + +static const struct regmap_config isl7998x_regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = ISL7998X_REG_PX_ACA_XFER_HIST_HOST(0xf), + .ranges = isl7998x_ranges, + .num_ranges = ARRAY_SIZE(isl7998x_ranges), + .rd_table = &isl7998x_readable_table, + .wr_table = &isl7998x_writeable_table, + .volatile_table = &isl7998x_volatile_table, + .cache_type = REGCACHE_RBTREE, +}; + +static int isl7998x_mc_init(struct isl7998x *isl7998x) +{ + unsigned int i; + + isl7998x->subdev.entity.ops = &isl7998x_entity_ops; + isl7998x->subdev.entity.function = MEDIA_ENT_F_VID_IF_BRIDGE; + + isl7998x->pads[ISL7998X_PAD_OUT].flags = MEDIA_PAD_FL_SOURCE; + for (i = ISL7998X_PAD_VIN1; i < ISL7998X_NUM_PADS; i++) + isl7998x->pads[i].flags = MEDIA_PAD_FL_SINK; + + return media_entity_pads_init(&isl7998x->subdev.entity, + ISL7998X_NUM_PADS, + isl7998x->pads); +} + +static int get_link_freq_menu_index(unsigned int lanes, + unsigned int inputs) +{ + int ret = -EINVAL; + + switch (lanes) { + case 1: + if (inputs == 1) + ret = 0; + if (inputs == 2) + ret = 1; + if (inputs == 4) + ret = 2; + break; + case 2: + if (inputs == 2) + ret = 0; + if (inputs == 4) + ret = 1; + break; + default: + break; + } + + return ret; +} + +static void isl7998x_remove_controls(struct isl7998x *isl7998x) +{ + 
v4l2_ctrl_handler_free(&isl7998x->ctrl_handler); + mutex_destroy(&isl7998x->ctrl_mutex); +} + +static int isl7998x_init_controls(struct isl7998x *isl7998x) +{ + struct v4l2_subdev *sd = &isl7998x->subdev; + int link_freq_index; + unsigned int i; + int ret; + + ret = v4l2_ctrl_handler_init(&isl7998x->ctrl_handler, + 2 + ARRAY_SIZE(isl7998x_ctrls)); + if (ret) + return ret; + + mutex_init(&isl7998x->ctrl_mutex); + isl7998x->ctrl_handler.lock = &isl7998x->ctrl_mutex; + link_freq_index = get_link_freq_menu_index(isl7998x->nr_mipi_lanes, + isl7998x->nr_inputs); + if (link_freq_index < 0 || + link_freq_index >= ARRAY_SIZE(link_freq_menu_items)) { + dev_err(sd->dev, + "failed to find MIPI link freq: %d lanes, %d inputs\n", + isl7998x->nr_mipi_lanes, isl7998x->nr_inputs); + ret = -EINVAL; + goto err; + } + + isl7998x->link_freq = v4l2_ctrl_new_int_menu(&isl7998x->ctrl_handler, + &isl7998x_ctrl_ops, + V4L2_CID_LINK_FREQ, + ARRAY_SIZE(link_freq_menu_items) - 1, + link_freq_index, + link_freq_menu_items); + if (isl7998x->link_freq) + isl7998x->link_freq->flags |= V4L2_CTRL_FLAG_READ_ONLY; + + for (i = 0; i < ARRAY_SIZE(isl7998x_ctrls); i++) + v4l2_ctrl_new_custom(&isl7998x->ctrl_handler, + &isl7998x_ctrls[i], NULL); + + v4l2_ctrl_new_std_menu_items(&isl7998x->ctrl_handler, + &isl7998x_ctrl_ops, + V4L2_CID_TEST_PATTERN, + ARRAY_SIZE(isl7998x_test_pattern_menu) - 1, + 0, 0, isl7998x_test_pattern_menu); + + ret = isl7998x->ctrl_handler.error; + if (ret) + goto err; + + isl7998x->subdev.ctrl_handler = &isl7998x->ctrl_handler; + v4l2_ctrl_handler_setup(&isl7998x->ctrl_handler); + + return 0; + +err: + isl7998x_remove_controls(isl7998x); + + return ret; +} + +static int isl7998x_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct v4l2_fwnode_endpoint endpoint = { + .bus_type = V4L2_MBUS_CSI2_DPHY, + }; + struct fwnode_handle *ep; + struct isl7998x *isl7998x; + struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); + int nr_inputs; + int ret; + + ret = i2c_check_functionality(adapter, I2C_FUNC_SMBUS_WORD_DATA); + if (!ret) { + dev_warn(&adapter->dev, + "I2C-Adapter doesn't support I2C_FUNC_SMBUS_WORD\n"); + return -EIO; + } + + isl7998x = devm_kzalloc(dev, sizeof(*isl7998x), GFP_KERNEL); + if (!isl7998x) + return -ENOMEM; + + isl7998x->pd_gpio = devm_gpiod_get_optional(dev, "powerdown", + GPIOD_OUT_HIGH); + if (IS_ERR(isl7998x->pd_gpio)) + return dev_err_probe(dev, PTR_ERR(isl7998x->pd_gpio), + "Failed to retrieve/request PD GPIO\n"); + + isl7998x->rstb_gpio = devm_gpiod_get_optional(dev, "reset", + GPIOD_OUT_HIGH); + if (IS_ERR(isl7998x->rstb_gpio)) + return dev_err_probe(dev, PTR_ERR(isl7998x->rstb_gpio), + "Failed to retrieve/request RSTB GPIO\n"); + + isl7998x->regmap = devm_regmap_init_i2c(client, &isl7998x_regmap); + if (IS_ERR(isl7998x->regmap)) + return dev_err_probe(dev, PTR_ERR(isl7998x->regmap), + "Failed to allocate register map\n"); + + ep = fwnode_graph_get_endpoint_by_id(dev_fwnode(dev), + ISL7998X_PAD_OUT, 0, 0); + if (!ep) + return dev_err_probe(dev, -EINVAL, "Missing endpoint node\n"); + + ret = v4l2_fwnode_endpoint_parse(ep, &endpoint); + fwnode_handle_put(ep); + if (ret) + return dev_err_probe(dev, ret, "Failed to parse endpoint\n"); + + if (endpoint.bus.mipi_csi2.num_data_lanes == 0 || + endpoint.bus.mipi_csi2.num_data_lanes > 2) + return dev_err_probe(dev, -EINVAL, + "Invalid number of MIPI lanes\n"); + + isl7998x->nr_mipi_lanes = endpoint.bus.mipi_csi2.num_data_lanes; + + nr_inputs = isl7998x_get_nr_inputs(dev->of_node); + if (nr_inputs < 0) + 
return dev_err_probe(dev, nr_inputs, + "Invalid number of input ports\n"); + isl7998x->nr_inputs = nr_inputs; + + v4l2_i2c_subdev_init(&isl7998x->subdev, client, &isl7998x_subdev_ops); + isl7998x->subdev.flags |= V4L2_SUBDEV_FL_HAS_DEVNODE; + + ret = isl7998x_mc_init(isl7998x); + if (ret < 0) + return ret; + + isl7998x->fmt = &isl7998x_colour_fmts[0]; + isl7998x->norm = V4L2_STD_NTSC; + isl7998x->enabled = 0; + + mutex_init(&isl7998x->lock); + + ret = isl7998x_init_controls(isl7998x); + if (ret) + goto err_entity_cleanup; + + ret = v4l2_async_register_subdev(&isl7998x->subdev); + if (ret < 0) + goto err_controls_cleanup; + + pm_runtime_enable(dev); + + return 0; + +err_controls_cleanup: + isl7998x_remove_controls(isl7998x); +err_entity_cleanup: + media_entity_cleanup(&isl7998x->subdev.entity); + + return ret; +} + +static int isl7998x_remove(struct i2c_client *client) +{ + struct isl7998x *isl7998x = i2c_to_isl7998x(client); + + pm_runtime_disable(&client->dev); + v4l2_async_unregister_subdev(&isl7998x->subdev); + isl7998x_remove_controls(isl7998x); + media_entity_cleanup(&isl7998x->subdev.entity); + + return 0; +} + +static const struct of_device_id isl7998x_of_match[] = { + { .compatible = "isil,isl79987", }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, isl7998x_of_match); + +static const struct i2c_device_id isl7998x_id[] = { + { "isl79987", 0 }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(i2c, isl7998x_id); + +static int __maybe_unused isl7998x_runtime_resume(struct device *dev) +{ + struct v4l2_subdev *sd = dev_get_drvdata(dev); + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + int ret; + + gpiod_set_value(isl7998x->rstb_gpio, 1); + gpiod_set_value(isl7998x->pd_gpio, 0); + gpiod_set_value(isl7998x->rstb_gpio, 0); + + ret = isl7998x_wait_power_on(isl7998x); + if (ret) + goto err; + + ret = isl7998x_init(isl7998x); + if (ret) + goto err; + + return 0; + +err: + gpiod_set_value(isl7998x->pd_gpio, 1); + + return ret; +} + +static int __maybe_unused isl7998x_runtime_suspend(struct device *dev) +{ + struct v4l2_subdev *sd = dev_get_drvdata(dev); + struct isl7998x *isl7998x = sd_to_isl7998x(sd); + + gpiod_set_value(isl7998x->pd_gpio, 1); + + return 0; +} + +static const struct dev_pm_ops isl7998x_pm_ops = { + SET_RUNTIME_PM_OPS(isl7998x_runtime_suspend, + isl7998x_runtime_resume, + NULL) +}; + +static struct i2c_driver isl7998x_i2c_driver = { + .driver = { + .name = "isl7998x", + .of_match_table = of_match_ptr(isl7998x_of_match), + .pm = &isl7998x_pm_ops, + }, + .probe_new = isl7998x_probe, + .remove = isl7998x_remove, + .id_table = isl7998x_id, +}; + +module_i2c_driver(isl7998x_i2c_driver); + +MODULE_DESCRIPTION("Intersil ISL7998x Analog to MIPI CSI-2/BT656 decoder"); +MODULE_AUTHOR("Marek Vasut "); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index c8e0f84d204d..92576ed03fc4 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -219,6 +219,12 @@ enum v4l2_colorfx { */ #define V4L2_CID_USER_ALLEGRO_BASE (V4L2_CID_USER_BASE + 0x1170) +/* + * The base for the isl7998x driver controls. + * We reserve 16 controls for this driver. 
+ */
+#define V4L2_CID_USER_ISL7998X_BASE		(V4L2_CID_USER_BASE + 0x1180)
+
 /* MPEG-class control IDs */
 /* The MPEG controls are applicable to all codec controls
 * and the 'MPEG' part of the define is historical */
-- 
cgit v1.2.3 

From bfa26ba343c727e055223be04e08f2ebdd43c293 Mon Sep 17 00:00:00 2001
From: William Mahon
Date: Thu, 3 Mar 2022 18:23:42 -0800
Subject: HID: add mapping for KEY_DICTATE

Numerous keyboards are adding dictate keys, which allow text messages
to be dictated via a microphone. This patch adds a new key definition
KEY_DICTATE and maps the 0x0c/0x0d8 usage code to this new keycode.
Additionally, hid-debug is adjusted to recognize this new usage code.

Signed-off-by: William Mahon
Acked-by: Benjamin Tissoires
Link: https://lore.kernel.org/r/20220303021501.1.I5dbf50eb1a7a6734ee727bda4a8573358c6d3ec0@changeid
Signed-off-by: Dmitry Torokhov
---
 drivers/hid/hid-debug.c                | 1 +
 drivers/hid/hid-input.c                | 1 +
 include/uapi/linux/input-event-codes.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 26c31d759914..8aa68416b1d7 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -969,6 +969,7 @@ static const char *keys[KEY_MAX + 1] = {
	[KEY_ASSISTANT] = "Assistant",
	[KEY_KBD_LAYOUT_NEXT] = "KbdLayoutNext",
	[KEY_EMOJI_PICKER] = "EmojiPicker",
+	[KEY_DICTATE] = "Dictate",
	[KEY_BRIGHTNESS_MIN] = "BrightnessMin",
	[KEY_BRIGHTNESS_MAX] = "BrightnessMax",
	[KEY_BRIGHTNESS_AUTO] = "BrightnessAuto",
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 112901d2d8d2..ce2b75a67cb8 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -992,6 +992,7 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
		case 0x0cd: map_key_clear(KEY_PLAYPAUSE);	break;
		case 0x0cf: map_key_clear(KEY_VOICECOMMAND);	break;
+		case 0x0d8: map_key_clear(KEY_DICTATE);	break;
		case 0x0d9: map_key_clear(KEY_EMOJI_PICKER);	break;
		case 0x0e0: map_abs_clear(ABS_VOLUME);		break;
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 225ec87d4f22..4db5d41848e4 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -612,6 +612,7 @@
 #define KEY_ASSISTANT		0x247	/* AL Context-aware desktop assistant */
 #define KEY_KBD_LAYOUT_NEXT	0x248	/* AC Next Keyboard Layout Select */
 #define KEY_EMOJI_PICKER	0x249	/* Show/hide emoji picker (HUTRR101) */
+#define KEY_DICTATE		0x24a	/* Start or Stop Voice Dictation Session (HUTRR99) */
 #define KEY_BRIGHTNESS_MIN	0x250	/* Set Brightness to Minimum */
 #define KEY_BRIGHTNESS_MAX	0x251	/* Set Brightness to Maximum */
-- 
cgit v1.2.3 

From 327b89f0acc4c20a06ed59e4d9af7f6d804dc2e2 Mon Sep 17 00:00:00 2001
From: William Mahon
Date: Thu, 3 Mar 2022 18:26:22 -0800
Subject: HID: add mapping for KEY_ALL_APPLICATIONS

This patch adds a new key definition for KEY_ALL_APPLICATIONS and
aliases KEY_DASHBOARD to it. It also maps the 0x0c/0x2a2 usage code
to KEY_ALL_APPLICATIONS.
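Userspace sees the keycodes added by this patch and the previous one as
ordinary EV_KEY events. A minimal sketch of watching for them on an evdev
node, illustrative only and not part of either patch (the event node path
is an assumption):

/* Read a keyboard's evdev node and report presses of the two newly
 * added keycodes. Illustrative sketch; /dev/input/event0 is an
 * assumed node path. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/input.h>

int main(void)
{
	struct input_event ev;
	int fd = open("/dev/input/event0", O_RDONLY);

	if (fd < 0)
		return 1;
	while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
		if (ev.type != EV_KEY || ev.value != 1)
			continue;	/* only key-press events */
		if (ev.code == KEY_DICTATE)
			printf("dictation key pressed\n");
		else if (ev.code == KEY_ALL_APPLICATIONS)
			printf("show-all-applications key pressed\n");
	}
	return 0;
}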
Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303035618.1.I3a7746ad05d270161a18334ae06e3b6db1a1d339@changeid Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-debug.c | 4 +++- drivers/hid/hid-input.c | 2 ++ include/uapi/linux/input-event-codes.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 8aa68416b1d7..81e7e404a5fc 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -860,7 +860,9 @@ static const char *keys[KEY_MAX + 1] = { [KEY_F22] = "F22", [KEY_F23] = "F23", [KEY_F24] = "F24", [KEY_PLAYCD] = "PlayCD", [KEY_PAUSECD] = "PauseCD", [KEY_PROG3] = "Prog3", - [KEY_PROG4] = "Prog4", [KEY_SUSPEND] = "Suspend", + [KEY_PROG4] = "Prog4", + [KEY_ALL_APPLICATIONS] = "AllApplications", + [KEY_SUSPEND] = "Suspend", [KEY_CLOSE] = "Close", [KEY_PLAY] = "Play", [KEY_FASTFORWARD] = "FastForward", [KEY_BASSBOOST] = "BassBoost", [KEY_PRINT] = "Print", [KEY_HP] = "HP", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index ce2b75a67cb8..56ec27398a00 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1084,6 +1084,8 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x29d: map_key_clear(KEY_KBD_LAYOUT_NEXT); break; + case 0x2a2: map_key_clear(KEY_ALL_APPLICATIONS); break; + case 0x2c7: map_key_clear(KEY_KBDINPUTASSIST_PREV); break; case 0x2c8: map_key_clear(KEY_KBDINPUTASSIST_NEXT); break; case 0x2c9: map_key_clear(KEY_KBDINPUTASSIST_PREVGROUP); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4db5d41848e4..7989d9483ea7 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -278,7 +278,8 @@ #define KEY_PAUSECD 201 #define KEY_PROG3 202 #define KEY_PROG4 203 -#define KEY_DASHBOARD 204 /* AL Dashboard */ +#define KEY_ALL_APPLICATIONS 204 /* AC Desktop Show All Applications */ +#define KEY_DASHBOARD KEY_ALL_APPLICATIONS #define KEY_SUSPEND 205 #define KEY_CLOSE 206 /* AC Close */ #define KEY_PLAY 207 -- cgit v1.2.3 From 96ba61ee5331eb6e2f4c2baeb994b9ceb01d8266 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sun, 20 Feb 2022 21:46:16 +0100 Subject: media: v4l2-ctrls: Add new V4L2_H264_DECODE_PARAM_FLAG_P/BFRAME flags Add new V4L2_H264_DECODE_PARAM_FLAG_P/BFRAME flags that are needed by NVIDIA Tegra video decoder. Userspace will have to set these flags in accordance to the type of a decoded frame. Reviewed-by: Nicolas Dufresne Signed-off-by: Dmitry Osipenko Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst | 6 ++++++ include/uapi/linux/v4l2-controls.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst index cc080c4257d0..f87584ad90ba 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst @@ -616,6 +616,12 @@ Stateless Codec Control ID * - ``V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD`` - 0x00000004 - + * - ``V4L2_H264_DECODE_PARAM_FLAG_PFRAME`` + - 0x00000008 + - + * - ``V4L2_H264_DECODE_PARAM_FLAG_BFRAME`` + - 0x00000010 + - .. 
raw:: latex diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index c8e0f84d204d..e3d48d571062 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1563,6 +1563,8 @@ struct v4l2_h264_dpb_entry { #define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01 #define V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC 0x02 #define V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD 0x04 +#define V4L2_H264_DECODE_PARAM_FLAG_PFRAME 0x08 +#define V4L2_H264_DECODE_PARAM_FLAG_BFRAME 0x10 #define V4L2_CID_STATELESS_H264_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 7) /** -- cgit v1.2.3 From 1092347165cf5ed1453c1f211641a859818a2828 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 9 Feb 2022 17:03:12 +0100 Subject: media: lirc: remove unused feature LIRC_CAN_SET_REC_DUTY_CYCLE There is no hardware which can filter input on the duty cycle, so no driver implements this. On top of that, LIRC_CAN_SET_REC_DUTY_CYCLE has the same value as LIRC_CAN_MEASURE_CARRIER (0x02000000). Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/lirc.h.rst.exceptions | 1 - include/uapi/linux/lirc.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/lirc.h.rst.exceptions b/Documentation/userspace-api/media/lirc.h.rst.exceptions index 5f31e967bc50..913d17b49831 100644 --- a/Documentation/userspace-api/media/lirc.h.rst.exceptions +++ b/Documentation/userspace-api/media/lirc.h.rst.exceptions @@ -30,7 +30,6 @@ ignore define LIRC_CAN_REC ignore define LIRC_CAN_SEND_MASK ignore define LIRC_CAN_REC_MASK -ignore define LIRC_CAN_SET_REC_DUTY_CYCLE # Obsolete ioctls diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 21c69a6a100d..23b0f2c8ba81 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -73,7 +73,6 @@ #define LIRC_CAN_REC_MASK LIRC_MODE2REC(LIRC_CAN_SEND_MASK) #define LIRC_CAN_SET_REC_CARRIER (LIRC_CAN_SET_SEND_CARRIER << 16) -#define LIRC_CAN_SET_REC_DUTY_CYCLE (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16) #define LIRC_CAN_SET_REC_CARRIER_RANGE 0x80000000 #define LIRC_CAN_GET_REC_RESOLUTION 0x20000000 -- cgit v1.2.3 From 72a74c8f0a0df12c7d7ea012aa70d95152858dea Mon Sep 17 00:00:00 2001 From: Ming Qian Date: Thu, 24 Feb 2022 11:10:00 +0800 Subject: media: add nv12m_8l128 and nv12m_10be_8l128 video format. nv12m_8l128 is 8-bit tiled nv12 format used by amphion decoder. nv12m_10be_8l128 is 10-bit tiled format used by amphion decoder. The tile size is 8x128 Signed-off-by: Ming Qian Signed-off-by: Shijie Qin Signed-off-by: Zhou Peng Signed-off-by: Hans Verkuil --- .../userspace-api/media/v4l/pixfmt-yuv-planar.rst | 28 +++++++++++++++++++--- drivers/media/v4l2-core/v4l2-ioctl.c | 2 ++ include/uapi/linux/videodev2.h | 2 ++ 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst index 9a969f662595..cc3e4b5791c5 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst @@ -257,6 +257,8 @@ of the luma plane. .. _V4L2-PIX-FMT-NV12-4L4: .. _V4L2-PIX-FMT-NV12-16L16: .. _V4L2-PIX-FMT-NV12-32L32: +.. _V4L2_PIX_FMT_NV12M_8L128: +.. _V4L2_PIX_FMT_NV12M_10BE_8L128: Tiled NV12 ---------- @@ -281,21 +283,41 @@ If the vertical resolution is an odd number of tiles, the last row of tiles is stored in linear order. 
The layouts of the luma and chroma planes are identical.

-``V4L2_PIX_FMT_NV12_4L4`` stores pixel in 4x4 tiles, and stores
+``V4L2_PIX_FMT_NV12_4L4`` stores pixels in 4x4 tiles, and stores
 tiles linearly in memory. The line stride and image height must be
 aligned to a multiple of 4. The layouts of the luma and chroma planes are
 identical.

-``V4L2_PIX_FMT_NV12_16L16`` stores pixel in 16x16 tiles, and stores
+``V4L2_PIX_FMT_NV12_16L16`` stores pixels in 16x16 tiles, and stores
 tiles linearly in memory. The line stride and image height must be
 aligned to a multiple of 16. The layouts of the luma and chroma planes are
 identical.

-``V4L2_PIX_FMT_NV12_32L32`` stores pixel in 32x32 tiles, and stores
+``V4L2_PIX_FMT_NV12_32L32`` stores pixels in 32x32 tiles, and stores
 tiles linearly in memory. The line stride and image height must be
 aligned to a multiple of 32. The layouts of the luma and chroma planes are
 identical.

+``V4L2_PIX_FMT_NV12M_8L128`` is similar to ``V4L2_PIX_FMT_NV12M`` but stores
+pixels in 2D 8x128 tiles, and stores tiles linearly in memory.
+The image height must be aligned to a multiple of 128.
+The layouts of the luma and chroma planes are identical.
+
+``V4L2_PIX_FMT_NV12M_10BE_8L128`` is similar to ``V4L2_PIX_FMT_NV12M`` but stores
+10-bit pixels in 2D 8x128 tiles, and stores tiles linearly in memory.
+The data is arranged in big-endian order.
+The image height must be aligned to a multiple of 128.
+The layouts of the luma and chroma planes are identical.
+Note that the tile size is 8 bytes multiplied by 128 bytes, which means
+that the low bits and high bits of one pixel may be in different tiles.
+The 10-bit pixels are packed, so 5 bytes contain four 10-bit pixels,
+laid out like this (for luma):
+byte 0: Y0(bits 9-2)
+byte 1: Y0(bits 1-0) Y1(bits 9-4)
+byte 2: Y1(bits 3-0) Y2(bits 9-6)
+byte 3: Y2(bits 5-0) Y3(bits 9-8)
+byte 4: Y3(bits 7-0)
+
 ``V4L2_PIX_FMT_MM21`` store luma pixel in 16x32 tiles, and chroma pixels
 in 16x16 tiles. The line stride must be aligned to a multiple of 16 and the
 image height must be aligned to a multiple of 32.
The number of luma and chroma
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 642cb90f457c..96e307fe3aab 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1390,6 +1390,8 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_META_FMT_VIVID:	descr = "Vivid Metadata"; break;
 	case V4L2_META_FMT_RK_ISP1_PARAMS:	descr = "Rockchip ISP1 3A Parameters"; break;
 	case V4L2_META_FMT_RK_ISP1_STAT_3A:	descr = "Rockchip ISP1 3A Statistics"; break;
+	case V4L2_PIX_FMT_NV12M_8L128:	descr = "NV12M (8x128 Linear)"; break;
+	case V4L2_PIX_FMT_NV12M_10BE_8L128: descr = "10-bit NV12M (8x128 Linear, BE)"; break;

 	default:
 		/* Compressed formats */
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index df8b9c486ba1..3768a0a80830 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -632,6 +632,8 @@ struct v4l2_pix_format {
 /* Tiled YUV formats, non contiguous planes */
 #define V4L2_PIX_FMT_NV12MT  v4l2_fourcc('T', 'M', '1', '2') /* 12  Y/CbCr 4:2:0 64x32 tiles */
 #define V4L2_PIX_FMT_NV12MT_16X16 v4l2_fourcc('V', 'M', '1', '2') /* 12  Y/CbCr 4:2:0 16x16 tiles */
+#define V4L2_PIX_FMT_NV12M_8L128      v4l2_fourcc('N', 'A', '1', '2') /* Y/CbCr 4:2:0 8x128 tiles */
+#define V4L2_PIX_FMT_NV12M_10BE_8L128 v4l2_fourcc_be('N', 'T', '1', '2') /* Y/CbCr 4:2:0 10-bit 8x128 tiles */

 /* Bayer formats - see http://www.siliconimaging.com/RGB%20Bayer.htm */
 #define V4L2_PIX_FMT_SBGGR8  v4l2_fourcc('B', 'A', '8', '1') /* 8  BGBG.. GRGR.. */
--
cgit v1.2.3

From d045b9eb95a9b611c483897a69e7285aefdc66d7 Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Mon, 7 Mar 2022 12:44:36 -0800
Subject: mptcp: introduce implicit endpoints

In some edge scenarios, an MPTCP subflow can use a local address mapped
by an "implicit" endpoint created by the in-kernel path manager.

The presence of such endpoints can be confusing, as their creation is
hard to track and they prevent the later creation of a user-space
endpoint using the same address.

Define a new endpoint flag to mark implicit endpoints and allow
user-space to replace implicit endpoints with user-provided data at
endpoint creation time.
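A short sketch distills the replacement rule implemented below in
pm_netlink.c (illustrative C only, not kernel code): an existing entry
may only be taken over when it is implicit and the new endpoint does
not force its own id.

#include <stdbool.h>
#include <linux/mptcp.h>

/* Mirrors the check in mptcp_pm_nl_append_new_local_addr(): a colliding
 * address replaces an existing entry only if that entry is implicit and
 * the user did not request a specific endpoint id.
 */
static bool may_replace_entry(unsigned int cur_flags, unsigned int new_id)
{
	return (cur_flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) && new_id == 0;
}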
Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 61 ++++++++++++++++++------- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 +- 3 files changed, 47 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f106a3941cdf..9690efedb5fa 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -81,6 +81,7 @@ enum { #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) #define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) enum { MPTCP_PM_CMD_UNSPEC, diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 91b77d1162cf..10368a4f1c4a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -877,10 +877,18 @@ static bool address_use_port(struct mptcp_pm_addr_entry *entry) MPTCP_PM_ADDR_FLAG_SIGNAL; } +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry) { - struct mptcp_pm_addr_entry *cur; + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; @@ -901,8 +909,22 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, list_for_each_entry(cur, &pernet->local_addr_list, list) { if (addresses_equal(&cur->addr, &entry->addr, address_use_port(entry) && - address_use_port(cur))) - goto out; + address_use_port(cur))) { + /* allow replacing the exiting endpoint only if such + * endpoint is an implicit one and the user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) + goto out; + if (entry->addr.id) + goto out; + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } } if (!entry->addr.id) { @@ -938,6 +960,12 @@ find_next: out: spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } return ret; } @@ -1036,7 +1064,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; - entry->flags = 0; + entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) @@ -1249,6 +1277,11 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); + return -EINVAL; + } + entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); @@ -1333,11 +1366,12 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - struct mptcp_addr_info *addr) + const struct mptcp_pm_addr_entry *entry) { - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; + const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; pr_debug("remove_id=%d", addr->id); @@ -1354,7 +1388,8 @@ static int 
mptcp_nl_remove_subflow_and_signal_addr(struct net *net, lock_sock(sk); remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); if (remove_subflow) mptcp_pm_remove_subflow(msk, &list); release_sock(sk); @@ -1367,14 +1402,6 @@ next: return 0; } -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { @@ -1451,7 +1478,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 02bab8a2d5a5..1e2e8dd9f0d6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1938,7 +1938,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup chk_join_nr "single address, backup" 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 0 + chk_prio_nr 1 1 # single address with port, backup reset @@ -1948,7 +1948,7 @@ backup_tests() run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow backup chk_join_nr "single address with port, backup" 1 1 1 chk_add_nr 1 1 - chk_prio_nr 1 0 + chk_prio_nr 1 1 } add_addr_ports_tests() -- cgit v1.2.3 From b530e9e1063ed2b817eae7eec6ed2daa8be11608 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 9 Mar 2022 11:53:42 +0100 Subject: bpf: Add "live packet" mode for XDP in BPF_PROG_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for running XDP programs through BPF_PROG_RUN in a mode that enables live packet processing of the resulting frames. Previous uses of BPF_PROG_RUN for XDP returned the XDP program return code and the modified packet data to userspace, which is useful for unit testing of XDP programs. The existing BPF_PROG_RUN for XDP allows userspace to set the ingress ifindex and RXQ number as part of the context object being passed to the kernel. This patch reuses that code, but adds a new mode with different semantics, which can be selected with the new BPF_F_TEST_XDP_LIVE_FRAMES flag. When running BPF_PROG_RUN in this mode, the XDP program return codes will be honoured: returning XDP_PASS will result in the frame being injected into the networking stack as if it came from the selected networking interface, while returning XDP_TX and XDP_REDIRECT will result in the frame being transmitted out that interface. XDP_TX is translated into an XDP_REDIRECT operation to the same interface, since the real XDP_TX action is only possible from within the network drivers themselves, not from the process context where BPF_PROG_RUN is executed. Internally, this new mode of operation creates a page pool instance while setting up the test run, and feeds pages from that into the XDP program. The setup cost of this is amortised over the number of repetitions specified by userspace. 
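As a sketch of how userspace drives this mode through libbpf (the
packet buffer, its length, and the ifindex are stand-ins, and a libbpf
with batch_size support in struct bpf_test_run_opts is assumed):

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Feed 'pkt' (a full Ethernet frame) through 'prog_fd' in live mode. */
static int run_xdp_live(int prog_fd, int ifindex, void *pkt, __u32 pkt_len)
{
	struct xdp_md ctx_in = {
		.data_end = pkt_len,		/* must equal data_size_in */
		.ingress_ifindex = ifindex,
	};
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = pkt,
		.data_size_in = pkt_len,
		.ctx_in = &ctx_in,
		.ctx_size_in = sizeof(ctx_in),
		.flags = BPF_F_TEST_XDP_LIVE_FRAMES,
		.repeat = 1 << 20,	/* amortise the setup cost */
	);

	/* data_out/ctx_out must stay unset in live mode */
	return bpf_prog_test_run_opts(prog_fd, &opts);
}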
To support the performance testing use case, we further optimise the setup step so that all pages in the pool are pre-initialised with the packet data, and pre-computed context and xdp_frame objects stored at the start of each page. This makes it possible to entirely avoid touching the page content on each XDP program invocation, and enables sending up to 9 Mpps/core on my test box. Because the data pages are recycled by the page pool, and the test runner doesn't re-initialise them for each run, subsequent invocations of the XDP program will see the packet data in the state it was after the last time it ran on that particular page. This means that an XDP program that modifies the packet before redirecting it has to be careful about which assumptions it makes about the packet content, but that is only an issue for the most naively written programs. Enabling the new flag is only allowed when not setting ctx_out and data_out in the test specification, since using it means frames will be redirected somewhere else, so they can't be returned. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220309105346.100053-2-toke@redhat.com --- include/uapi/linux/bpf.h | 3 + kernel/bpf/Kconfig | 1 + kernel/bpf/syscall.c | 2 +- net/bpf/test_run.c | 334 +++++++++++++++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 3 + 5 files changed, 328 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4eebea830613..bc23020b638d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1232,6 +1232,8 @@ enum { /* If set, run the test on the cpu specified by bpf_attr.test.cpu */ #define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { @@ -1393,6 +1395,7 @@ union bpf_attr { __aligned_u64 ctx_out; __u32 flags; __u32 cpu; + __u32 batch_size; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index c3cf0b86eeb2..d56ee177d5f8 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -30,6 +30,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select PAGE_POOL if NET default n help Enable the bpf() system call that allows to manipulate BPF programs diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index db402ebc5570..9beb585be5a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3336,7 +3336,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu +#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index ba410b069824..25169908be4a 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -53,10 +54,11 @@ static void bpf_test_timer_leave(struct bpf_test_timer *t) rcu_read_unlock(); } -static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) +static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, + u32 repeat, int *err, u32 *duration) __must_hold(rcu) { - t->i++; + t->i += iterations; if (t->i >= repeat) { /* We're done. 
*/ t->time_spent += ktime_get_ns() - t->time_start; @@ -88,6 +90,286 @@ reset: return false; } +/* We put this struct at the head of each page with a context and frame + * initialised when the page is allocated, so we don't have to do this on each + * repetition of the test run. + */ +struct xdp_page_head { + struct xdp_buff orig_ctx; + struct xdp_buff ctx; + struct xdp_frame frm; + u8 data[]; +}; + +struct xdp_test_data { + struct xdp_buff *orig_ctx; + struct xdp_rxq_info rxq; + struct net_device *dev; + struct page_pool *pp; + struct xdp_frame **frames; + struct sk_buff **skbs; + u32 batch_size; + u32 frame_cnt; +}; + +#define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head) \ + - sizeof(struct skb_shared_info)) +#define TEST_XDP_MAX_BATCH 256 + +static void xdp_test_run_init_page(struct page *page, void *arg) +{ + struct xdp_page_head *head = phys_to_virt(page_to_phys(page)); + struct xdp_buff *new_ctx, *orig_ctx; + u32 headroom = XDP_PACKET_HEADROOM; + struct xdp_test_data *xdp = arg; + size_t frm_len, meta_len; + struct xdp_frame *frm; + void *data; + + orig_ctx = xdp->orig_ctx; + frm_len = orig_ctx->data_end - orig_ctx->data_meta; + meta_len = orig_ctx->data - orig_ctx->data_meta; + headroom -= meta_len; + + new_ctx = &head->ctx; + frm = &head->frm; + data = &head->data; + memcpy(data + headroom, orig_ctx->data_meta, frm_len); + + xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); + xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); + new_ctx->data = new_ctx->data_meta + meta_len; + + xdp_update_frame_from_buff(new_ctx, frm); + frm->mem = new_ctx->rxq->mem; + + memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); +} + +static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) +{ + struct xdp_mem_info mem = {}; + struct page_pool *pp; + int err = -ENOMEM; + struct page_pool_params pp_params = { + .order = 0, + .flags = 0, + .pool_size = xdp->batch_size, + .nid = NUMA_NO_NODE, + .max_len = TEST_XDP_FRAME_SIZE, + .init_callback = xdp_test_run_init_page, + .init_arg = xdp, + }; + + xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); + if (!xdp->frames) + return -ENOMEM; + + xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); + if (!xdp->skbs) + goto err_skbs; + + pp = page_pool_create(&pp_params); + if (IS_ERR(pp)) { + err = PTR_ERR(pp); + goto err_pp; + } + + /* will copy 'mem.id' into pp->xdp_mem_id */ + err = xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pp); + if (err) + goto err_mmodel; + + xdp->pp = pp; + + /* We create a 'fake' RXQ referencing the original dev, but with an + * xdp_mem_info pointing to our page_pool + */ + xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); + xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; + xdp->rxq.mem.id = pp->xdp_mem_id; + xdp->dev = orig_ctx->rxq->dev; + xdp->orig_ctx = orig_ctx; + + return 0; + +err_mmodel: + page_pool_destroy(pp); +err_pp: + kfree(xdp->skbs); +err_skbs: + kfree(xdp->frames); + return err; +} + +static void xdp_test_run_teardown(struct xdp_test_data *xdp) +{ + page_pool_destroy(xdp->pp); + kfree(xdp->frames); + kfree(xdp->skbs); +} + +static bool ctx_was_changed(struct xdp_page_head *head) +{ + return head->orig_ctx.data != head->ctx.data || + head->orig_ctx.data_meta != head->ctx.data_meta || + head->orig_ctx.data_end != head->ctx.data_end; +} + +static void reset_ctx(struct xdp_page_head *head) +{ + if (likely(!ctx_was_changed(head))) + return; + + head->ctx.data = head->orig_ctx.data; + head->ctx.data_meta = 
head->orig_ctx.data_meta; + head->ctx.data_end = head->orig_ctx.data_end; + xdp_update_frame_from_buff(&head->ctx, &head->frm); +} + +static int xdp_recv_frames(struct xdp_frame **frames, int nframes, + struct sk_buff **skbs, + struct net_device *dev) +{ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n; + LIST_HEAD(list); + + n = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, (void **)skbs); + if (unlikely(n == 0)) { + for (i = 0; i < nframes; i++) + xdp_return_frame(frames[i]); + return -ENOMEM; + } + + for (i = 0; i < nframes; i++) { + struct xdp_frame *xdpf = frames[i]; + struct sk_buff *skb = skbs[i]; + + skb = __xdp_build_skb_from_frame(xdpf, skb, dev); + if (!skb) { + xdp_return_frame(xdpf); + continue; + } + + list_add_tail(&skb->list, &list); + } + netif_receive_skb_list(&list); + + return 0; +} + +static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, + u32 repeat) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + int err = 0, act, ret, i, nframes = 0, batch_sz; + struct xdp_frame **frames = xdp->frames; + struct xdp_page_head *head; + struct xdp_frame *frm; + bool redirect = false; + struct xdp_buff *ctx; + struct page *page; + + batch_sz = min_t(u32, repeat, xdp->batch_size); + + local_bh_disable(); + xdp_set_return_frame_no_direct(); + + for (i = 0; i < batch_sz; i++) { + page = page_pool_dev_alloc_pages(xdp->pp); + if (!page) { + err = -ENOMEM; + goto out; + } + + head = phys_to_virt(page_to_phys(page)); + reset_ctx(head); + ctx = &head->ctx; + frm = &head->frm; + xdp->frame_cnt++; + + act = bpf_prog_run_xdp(prog, ctx); + + /* if program changed pkt bounds we need to update the xdp_frame */ + if (unlikely(ctx_was_changed(head))) { + ret = xdp_update_frame_from_buff(ctx, frm); + if (ret) { + xdp_return_buff(ctx); + continue; + } + } + + switch (act) { + case XDP_TX: + /* we can't do a real XDP_TX since we're not in the + * driver, so turn it into a REDIRECT back to the same + * index + */ + ri->tgt_index = xdp->dev->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + fallthrough; + case XDP_REDIRECT: + redirect = true; + ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); + if (ret) + xdp_return_buff(ctx); + break; + case XDP_PASS: + frames[nframes++] = frm; + break; + default: + bpf_warn_invalid_xdp_action(NULL, prog, act); + fallthrough; + case XDP_DROP: + xdp_return_buff(ctx); + break; + } + } + +out: + if (redirect) + xdp_do_flush(); + if (nframes) { + ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); + if (ret) + err = ret; + } + + xdp_clear_return_frame_no_direct(); + local_bh_enable(); + return err; +} + +static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, + u32 repeat, u32 batch_size, u32 *time) + +{ + struct xdp_test_data xdp = { .batch_size = batch_size }; + struct bpf_test_timer t = { .mode = NO_MIGRATE }; + int ret; + + if (!repeat) + repeat = 1; + + ret = xdp_test_run_setup(&xdp, ctx); + if (ret) + return ret; + + bpf_test_timer_enter(&t); + do { + xdp.frame_cnt = 0; + ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); + if (unlikely(ret < 0)) + break; + } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); + bpf_test_timer_leave(&t); + + xdp_test_run_teardown(&xdp); + return ret; +} + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { @@ -119,7 +401,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, *retval = bpf_prog_run_xdp(prog, ctx); else *retval 
= bpf_prog_run(prog, ctx); - } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); @@ -446,7 +728,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int b = 2, err = -EFAULT; u32 retval = 0; - if (kattr->test.flags || kattr->test.cpu) + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; switch (prog->expected_attach_type) { @@ -510,7 +792,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, /* doesn't support data_in/out, ctx_out, duration, or repeat */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || - kattr->test.repeat) + kattr->test.repeat || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || @@ -741,7 +1023,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; - if (kattr->test.flags || kattr->test.cpu) + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, @@ -922,7 +1204,9 @@ static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 batch_size = kattr->test.batch_size; u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 retval, duration, max_data_sz; @@ -938,6 +1222,18 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; + if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) + return -EINVAL; + + if (do_live) { + if (!batch_size) + batch_size = NAPI_POLL_WEIGHT; + else if (batch_size > TEST_XDP_MAX_BATCH) + return -E2BIG; + } else if (batch_size) { + return -EINVAL; + } + ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -946,14 +1242,20 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end != size || ctx->data > ctx->data_end || - unlikely(xdp_metalen_invalid(ctx->data))) + unlikely(xdp_metalen_invalid(ctx->data)) || + (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; } max_data_sz = 4096 - headroom - tailroom; - size = min_t(u32, size, max_data_sz); + if (size > max_data_sz) { + /* disallow live data mode for jumbo frames */ + if (do_live) + goto free_ctx; + size = max_data_sz; + } data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); if (IS_ERR(data)) { @@ -1011,7 +1313,10 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (repeat > 1) bpf_prog_change_xdp(NULL, prog); - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); + if (do_live) + ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); + else + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. 
@@ -1073,7 +1378,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
 		return -EINVAL;

-	if (kattr->test.flags || kattr->test.cpu)
+	if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
 		return -EINVAL;

 	if (size < ETH_HLEN)
@@ -1108,7 +1413,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	do {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
 					  size, flags);
-	} while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+	} while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration));
 	bpf_test_timer_leave(&t);

 	if (ret < 0)
@@ -1140,7 +1445,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
 	if (prog->type != BPF_PROG_TYPE_SK_LOOKUP)
 		return -EINVAL;

-	if (kattr->test.flags || kattr->test.cpu)
+	if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
 		return -EINVAL;

 	if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
@@ -1203,7 +1508,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
 	do {
 		ctx.selected_sk = NULL;
 		retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
-	} while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+	} while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration));
 	bpf_test_timer_leave(&t);

 	if (ret < 0)
@@ -1242,7 +1547,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 	/* doesn't support data_in/out, ctx_out, duration, or repeat or flags */
 	if (kattr->test.data_in || kattr->test.data_out ||
 	    kattr->test.ctx_out || kattr->test.duration ||
-	    kattr->test.repeat || kattr->test.flags)
+	    kattr->test.repeat || kattr->test.flags ||
+	    kattr->test.batch_size)
 		return -EINVAL;

 	if (ctx_size_in < prog->aux->max_ctx_offset ||
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4eebea830613..bc23020b638d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1232,6 +1232,8 @@ enum {

 /* If set, run the test on the cpu specified by bpf_attr.test.cpu */
 #define BPF_F_TEST_RUN_ON_CPU	(1U << 0)
+/* If set, XDP frames will be transmitted after processing */
+#define BPF_F_TEST_XDP_LIVE_FRAMES	(1U << 1)

 /* type for BPF_ENABLE_STATS */
 enum bpf_stats_type {
@@ -1393,6 +1395,7 @@ union bpf_attr {
 		__aligned_u64	ctx_out;
 		__u32		flags;
 		__u32		cpu;
+		__u32		batch_size;
 	} test;

 	struct { /* anonymous struct used by BPF_*_GET_*_ID */
--
cgit v1.2.3

From 530e0d46c61314c59ecfdb8d3bcb87edbc0f85d3 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp
Date: Wed, 9 Mar 2022 13:04:13 +0100
Subject: can: isotp: set default value for N_As to 50 micro seconds

The N_As value describes the time a CAN frame needs on the wire when
transmitted by the CAN controller. Even very short CAN FD frames need
around 100 usecs (bitrate 1Mbit/s, data bitrate 8Mbit/s).

Having N_As be zero (the former default) leads to 'no CAN frame
separation' when STmin is set to zero by the receiving node. This
'burst mode' should not be enabled by default as it could potentially
dump a high number of CAN frames into the netdev queue from the soft
hrtimer context. This does not affect the system stability but is just
not nice and cooperative.

With this N_As/frame_txtime value the 'burst mode' is disabled by
default.

As user space applications usually do not set the frame_txtime element
of struct can_isotp_options the new in-kernel default is very likely
overwritten with zero when the sockopt() CAN_ISOTP_OPTS is invoked.
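Picking up on that last point, a minimal userspace sketch (error
handling trimmed) of the get-modify-set pattern that keeps the new
default intact across a CAN_ISOTP_OPTS round trip:

#include <sys/socket.h>
#include <linux/can/isotp.h>

/* Assumes 's' is an unbound CAN_ISOTP socket. */
static int add_isotp_flags(int s, __u32 extra_flags)
{
	struct can_isotp_options opts;
	socklen_t len = sizeof(opts);

	/* fetch current options first so frame_txtime keeps its value */
	if (getsockopt(s, SOL_CAN_ISOTP, CAN_ISOTP_OPTS, &opts, &len) < 0)
		return -1;

	opts.flags |= extra_flags;
	/* a frame_txtime of 0 now means 'no change'; the explicit escape
	 * value described below disables N_As entirely
	 */
	return setsockopt(s, SOL_CAN_ISOTP, CAN_ISOTP_OPTS,
			  &opts, sizeof(opts));
}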
To make sure that an N_As value of zero is only set intentionally, the
value '0' is now interpreted as 'do not change the current value'.

When a frame_txtime of zero is required for testing purposes, this
CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime.

Link: https://lore.kernel.org/all/20220309120416.83514-2-socketcan@hartkopp.net
Signed-off-by: Oliver Hartkopp
Signed-off-by: Marc Kleine-Budde
---
 include/uapi/linux/can/isotp.h | 28 ++++++++++++++++++++++------
 net/can/isotp.c                | 12 +++++++++++-
 2 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h
index c55935b64ccc..590f8aea2b6d 100644
--- a/include/uapi/linux/can/isotp.h
+++ b/include/uapi/linux/can/isotp.h
@@ -137,20 +137,16 @@ struct can_isotp_ll_options {
 #define CAN_ISOTP_WAIT_TX_DONE	0x400	/* wait for tx completion */
 #define CAN_ISOTP_SF_BROADCAST	0x800	/* 1-to-N functional addressing */

-/* default values */
+/* protocol machine default values */

 #define CAN_ISOTP_DEFAULT_FLAGS		0
 #define CAN_ISOTP_DEFAULT_EXT_ADDRESS	0x00
 #define CAN_ISOTP_DEFAULT_PAD_CONTENT	0xCC /* prevent bit-stuffing */
-#define CAN_ISOTP_DEFAULT_FRAME_TXTIME	0
+#define CAN_ISOTP_DEFAULT_FRAME_TXTIME	50000 /* 50 micro seconds */
 #define CAN_ISOTP_DEFAULT_RECV_BS	0
 #define CAN_ISOTP_DEFAULT_RECV_STMIN	0x00
 #define CAN_ISOTP_DEFAULT_RECV_WFTMAX	0

-#define CAN_ISOTP_DEFAULT_LL_MTU	CAN_MTU
-#define CAN_ISOTP_DEFAULT_LL_TX_DL	CAN_MAX_DLEN
-#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS	0
-
 /*
  * Remark on CAN_ISOTP_DEFAULT_RECV_* values:
  *
@@ -162,4 +158,24 @@ struct can_isotp_ll_options {
  * consistency and copied directly into the flow control (FC) frame.
  */

+/* link layer default values => make use of Classical CAN frames */
+
+#define CAN_ISOTP_DEFAULT_LL_MTU	CAN_MTU
+#define CAN_ISOTP_DEFAULT_LL_TX_DL	CAN_MAX_DLEN
+#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS	0
+
+/*
+ * The CAN_ISOTP_DEFAULT_FRAME_TXTIME has become a non-zero value as
+ * it only makes sense for isotp implementation tests to run without
+ * a N_As value. As user space applications usually do not set the
+ * frame_txtime element of struct can_isotp_options the new in-kernel
+ * default is very likely overwritten with zero when the sockopt()
+ * CAN_ISOTP_OPTS is invoked.
+ * To make sure that a N_As value of zero is only set intentional the
+ * value '0' is now interpreted as 'do not change the current value'.
+ * When a frame_txtime of zero is required for testing purposes this
+ * CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime.
+ */
+#define CAN_ISOTP_FRAME_TXTIME_ZERO	0xFFFFFFFF
+
 #endif /* !_UAPI_CAN_ISOTP_H */
diff --git a/net/can/isotp.c b/net/can/isotp.c
index d59f1758ac9c..47404ba59981 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -140,6 +140,7 @@ struct isotp_sock {
 	struct can_isotp_options opt;
 	struct can_isotp_fc_options rxfc, txfc;
 	struct can_isotp_ll_options ll;
+	u32 frame_txtime;
 	u32 force_tx_stmin;
 	u32 force_rx_stmin;
 	u32 cfecho;	/* consecutive frame echo tag */
@@ -360,7 +361,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
 		so->tx_gap = ktime_set(0, 0);
 		/* add transmission time for CAN frame N_As */
-		so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime);
+		so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime);
 		/* add waiting time for consecutive frames N_Cs */
 		if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
 			so->tx_gap = ktime_add_ns(so->tx_gap,
@@ -1293,6 +1294,14 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname,
 		/* no separate rx_ext_address is given => use ext_address */
 		if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
 			so->opt.rx_ext_address = so->opt.ext_address;
+
+		/* check for frame_txtime changes (0 => no changes) */
+		if (so->opt.frame_txtime) {
+			if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO)
+				so->frame_txtime = 0;
+			else
+				so->frame_txtime = so->opt.frame_txtime;
+		}
 		break;

 	case CAN_ISOTP_RECV_FC:
@@ -1498,6 +1507,7 @@ static int isotp_init(struct sock *sk)
 	so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
 	so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
 	so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+	so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
 	so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
 	so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
 	so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;
--
cgit v1.2.3

From e7a6c00dc77aedf27a601738ea509f1caea6d673 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 4 Mar 2022 08:22:22 -0700
Subject: io_uring: add support for registering ring file descriptors

Lots of workloads use multiple threads, in which case the file table is
shared between them. This makes getting and putting the ring file
descriptor for each io_uring_enter(2) system call more expensive, as it
involves an atomic get and put for each call.

Similarly to how we allow registering normal file descriptors to avoid
this overhead, add support for an io_uring_register(2) API that allows
registering the ring fds themselves:

1) IORING_REGISTER_RING_FDS - takes an array of io_uring_rsrc_update
   structs, and registers them with the task.

2) IORING_UNREGISTER_RING_FDS - takes an array of io_uring_rsrc_update
   structs, and unregisters them.

When a ring fd is registered, it is internally represented by an offset.
This offset is returned to the application, and the application then
uses this offset and sets IORING_ENTER_REGISTERED_RING for the
io_uring_enter(2) system call. This works just like using a registered
file descriptor, rather than a real one, in an SQE, where
IOSQE_FIXED_FILE gets set to tell io_uring that we're using an internal
offset/descriptor rather than a real file descriptor.

In initial testing, this provides a nice bump in performance for
threaded applications in real-world cases where the batch count (e.g.
the number of requests submitted per io_uring_enter(2) invocation) is
low.
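A sketch of the resulting userspace flow, using raw syscalls rather
than any liburing helper:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register 'ring_fd' with the task; returns the registered offset. */
static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update upd = {
		.offset = -1U,	/* let the kernel pick a free slot */
		.data	= ring_fd,
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RING_FDS, &upd, 1) != 1)
		return -1;

	/* the kernel wrote the chosen index back into upd.offset */
	return upd.offset;
}

/* Enter via the registered offset instead of the real ring fd. */
static int enter_registered(int reg_offset, unsigned int to_submit)
{
	return syscall(__NR_io_uring_enter, reg_offset, to_submit, 0,
		       IORING_ENTER_REGISTERED_RING, NULL, 0);
}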
In a microbenchmark, submitting NOP requests, we see the following increases in performance: Requests per syscall Baseline Registered Increase ---------------------------------------------------------------- 1 ~7030K ~8080K +15% 2 ~13120K ~14800K +13% 4 ~22740K ~25300K +11% Co-developed-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 182 ++++++++++++++++++++++++++++++++++++++++-- include/linux/io_uring.h | 5 +- include/uapi/linux/io_uring.h | 13 ++- 3 files changed, 190 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index f8fd3b2bb30d..daa3609b742f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -466,6 +466,11 @@ struct io_ring_ctx { }; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + struct io_uring_task { /* submission side */ int cached_refs; @@ -481,6 +486,7 @@ struct io_uring_task { struct io_wq_work_list task_list; struct io_wq_work_list prior_task_list; struct callback_head task_work; + struct file **registered_rings; bool task_running; }; @@ -8788,8 +8794,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (unlikely(!tctx)) return -ENOMEM; + tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, + sizeof(struct file *), GFP_KERNEL); + if (unlikely(!tctx->registered_rings)) { + kfree(tctx); + return -ENOMEM; + } + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { + kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -8798,6 +8812,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); + kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -8822,6 +8837,7 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); + kfree(tctx->registered_rings); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; @@ -10043,6 +10059,139 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } +void io_uring_unreg_ringfd(void) +{ + struct io_uring_task *tctx = current->io_uring; + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_rings[i]) { + fput(tctx->registered_rings[i]); + tctx->registered_rings[i] = NULL; + } + } +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (file->f_op != &io_uring_fops) { + fput(file); + return -EOPNOTSUPP; + } + tctx->registered_rings[offset] = file; + return offset; + } + + return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. 
+ */ +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_rsrc_update reg; + struct io_uring_task *tctx; + int ret, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + + mutex_unlock(&ctx->uring_lock); + ret = io_uring_add_tctx_node(ctx); + mutex_lock(&ctx->uring_lock); + if (ret) + return ret; + + tctx = current->io_uring; + for (i = 0; i < nr_args; i++) { + int start, end; + + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + + if (reg.offset == -1U) { + start = 0; + end = IO_RINGFD_REG_MAX; + } else { + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + start = reg.offset; + end = start + 1; + } + + ret = io_ring_add_registered_fd(tctx, reg.data, start, end); + if (ret < 0) + break; + + reg.offset = ret; + if (copy_to_user(&arg[i], ®, sizeof(reg))) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + ret = -EFAULT; + break; + } + } + + return i ? i : ret; +} + +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_task *tctx = current->io_uring; + struct io_uring_rsrc_update reg; + int ret = 0, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + if (!tctx) + return 0; + + for (i = 0; i < nr_args; i++) { + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + + reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[reg.offset]) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + } + } + + return i ? i : ret; +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -10173,12 +10322,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | + IORING_ENTER_REGISTERED_RING))) return -EINVAL; - f = fdget(fd); - if (unlikely(!f.file)) - return -EBADF; + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + if (flags & IORING_ENTER_REGISTERED_RING) { + struct io_uring_task *tctx = current->io_uring; + + if (!tctx || fd >= IO_RINGFD_REG_MAX) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + f.file = tctx->registered_rings[fd]; + if (unlikely(!f.file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + } ret = -EOPNOTSUPP; if (unlikely(f.file->f_op != &io_uring_fops)) @@ -10252,7 +10417,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, out: percpu_ref_put(&ctx->refs); out_fput: - fdput(f); + if (!(flags & IORING_ENTER_REGISTERED_RING)) + fdput(f); return submitted ? 
submitted : ret;
 }

@@ -11142,6 +11308,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_RING_FDS:
+		ret = io_ringfd_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_RING_FDS:
+		ret = io_ringfd_unregister(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@ struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);

 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..42b2fe84dbcd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)

 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,

+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_RING_FDS		= 20,
+	IORING_UNREGISTER_RING_FDS		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
--
cgit v1.2.3

From 4f57f06ce2186c31c3da52386125dc57b1cd6f96 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 10 Mar 2022 06:27:26 -0700
Subject: io_uring: add support for IORING_OP_MSG_RING command

This adds support for IORING_OP_MSG_RING, which allows an SQE to signal
another ring. That allows either waking up someone waiting on the ring,
or even passing a 64-bit value via the user_data field in the CQE.

sqe->fd must contain the fd of a ring that should receive the CQE.
sqe->off will be propagated to the cqe->user_data on the target ring,
and sqe->len will be propagated to cqe->res. The resulting CQE will have
IORING_CQE_F_MSG set in its flags, to indicate that this CQE was
generated from a messaging request rather than an SQE issued locally on
that ring. This effectively allows passing a 64-bit and a 32-bit
quantity between the two rings.

This request type has the following request-specific error cases:

- -EBADFD. Set if the sqe->fd doesn't point to a file descriptor that is
  of the io_uring type.
- -EOVERFLOW. Set if we were not able to deliver a request to the target
  ring.
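Sketched without any library helper, filling such an SQE by hand looks
roughly like this:

#include <linux/io_uring.h>
#include <string.h>

/* Make 'target_fd' (another ring) receive a CQE carrying 'user_data'
 * in cqe->user_data, 'res' in cqe->res, and IORING_CQE_F_MSG in flags.
 */
static void prep_msg_ring(struct io_uring_sqe *sqe, int target_fd,
			  unsigned int res, unsigned long long user_data)
{
	memset(sqe, 0, sizeof(*sqe));	/* unused fields must be zero */
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_fd;
	sqe->len = res;		/* propagated to cqe->res */
	sqe->off = user_data;	/* propagated to cqe->user_data */
}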
Signed-off-by: Jens Axboe --- fs/io_uring.c | 55 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 3 +++ 2 files changed, 58 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0f5e999e569f..36b001365a79 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -706,6 +706,12 @@ struct io_hardlink { int flags; }; +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -871,6 +877,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_msg msg; }; u8 opcode; @@ -1121,6 +1128,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -4322,6 +4332,46 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_msg_ring_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index || + sqe->personality)) + return -EINVAL; + + if (req->file->f_op != &io_uring_fops) + return -EBADFD; + + req->msg.user_data = READ_ONCE(sqe->off); + req->msg.len = READ_ONCE(sqe->len); + return 0; +} + +static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx; + struct io_msg *msg = &req->msg; + int ret = -EOVERFLOW; + bool filled; + + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, + IORING_CQE_F_MSG); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + + __io_req_complete(req, issue_flags, ret, 0); + return 0; +} + static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -6700,6 +6750,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_symlinkat_prep(req, sqe); case IORING_OP_LINKAT: return io_linkat_prep(req, sqe); + case IORING_OP_MSG_RING: + return io_msg_ring_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6983,6 +7035,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_LINKAT: ret = io_linkat(req, issue_flags); break; + case IORING_OP_MSG_RING: + ret = io_msg_ring(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 42b2fe84dbcd..8bd4bfdd9a89 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -143,6 +143,7 @@ enum { IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, IORING_OP_LINKAT, + IORING_OP_MSG_RING, /* this goes last, obviously */ IORING_OP_LAST, @@ -199,9 +200,11 @@ struct io_uring_cqe { * * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries + * IORING_CQE_F_MSG If set, CQE was generated with IORING_OP_MSG_RING */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) +#define IORING_CQE_F_MSG (1U << 2) enum { IORING_CQE_BUFFER_SHIFT = 16, -- cgit v1.2.3 From 153474ba1a4aed0a7b797b4c2be8c35c7a4e57bd Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 27 Jan 2022 11:46:37 -0600 Subject: ptrace: Create ptrace_report_syscall_{entry,exit} in ptrace.h Rename tracehook_report_syscall_{entry,exit} to ptrace_report_syscall_{entry,exit} and place them in ptrace.h There is no longer any generic tracehook infractructure so make these ptrace specific functions ptrace specific. Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20220309162454.123006-3-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/Kconfig | 2 +- arch/alpha/kernel/ptrace.c | 5 ++-- arch/arc/kernel/ptrace.c | 5 ++-- arch/arm/kernel/ptrace.c | 5 ++-- arch/arm64/kernel/ptrace.c | 7 +++-- arch/csky/kernel/ptrace.c | 5 ++-- arch/h8300/kernel/ptrace.c | 5 ++-- arch/hexagon/kernel/traps.c | 6 ++--- arch/ia64/kernel/ptrace.c | 4 +-- arch/m68k/kernel/ptrace.c | 6 ++--- arch/microblaze/kernel/ptrace.c | 5 ++-- arch/mips/kernel/ptrace.c | 5 ++-- arch/nds32/include/asm/syscall.h | 2 +- arch/nds32/kernel/ptrace.c | 5 ++-- arch/nios2/kernel/ptrace.c | 5 ++-- arch/openrisc/kernel/ptrace.c | 5 ++-- arch/parisc/kernel/ptrace.c | 7 +++-- arch/powerpc/kernel/ptrace/ptrace.c | 8 +++--- arch/riscv/kernel/ptrace.c | 5 ++-- arch/sh/kernel/ptrace_32.c | 5 ++-- arch/sparc/kernel/ptrace_32.c | 5 ++-- arch/sparc/kernel/ptrace_64.c | 5 ++-- arch/um/kernel/ptrace.c | 5 ++-- arch/xtensa/kernel/ptrace.c | 5 ++-- include/asm-generic/syscall.h | 2 +- include/linux/entry-common.h | 6 ++--- include/linux/ptrace.h | 51 +++++++++++++++++++++++++++++++++++++ include/linux/tracehook.h | 51 ------------------------------------- include/uapi/linux/ptrace.h | 2 +- kernel/entry/common.c | 1 + 30 files changed, 109 insertions(+), 126 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 678a80713b21..a517a949eb1d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -217,7 +217,7 @@ config TRACE_IRQFLAGS_SUPPORT # asm/syscall.h supplying asm-generic/syscall.h interface # linux/regset.h user_regset interfaces # CORE_DUMP_USE_REGSET #define'd in linux/elf.h -# TIF_SYSCALL_TRACE calls tracehook_report_syscall_{entry,exit} +# TIF_SYSCALL_TRACE calls ptrace_report_syscall_{entry,exit} # TIF_NOTIFY_RESUME calls tracehook_notify_resume() # signal delivery calls tracehook_signal_handler() # diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 8c43212ae38e..a1a239ea002d 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -323,7 +322,7 @@ asmlinkage unsigned long syscall_trace_enter(void) unsigned long ret = 0; struct pt_regs *regs = current_pt_regs(); if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(current_pt_regs())) + ptrace_report_syscall_entry(current_pt_regs())) ret = -1UL; audit_syscall_entry(regs->r0, regs->r16, regs->r17, regs->r18, regs->r19); return ret ?: current_pt_regs()->r0; @@ -334,5 +333,5 @@ syscall_trace_leave(void) { audit_syscall_exit(current_pt_regs()); if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(current_pt_regs(), 0); + ptrace_report_syscall_exit(current_pt_regs(), 0); } diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c index 883391977fdf..54b419ac8bda 100644 --- a/arch/arc/kernel/ptrace.c +++ b/arch/arc/kernel/ptrace.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include @@ -258,7 +257,7 @@ long arch_ptrace(struct task_struct *child, long request, asmlinkage int syscall_trace_entry(struct pt_regs *regs) { - if 
(tracehook_report_syscall_entry(regs)) + if (ptrace_report_syscall_entry(regs)) return ULONG_MAX; return regs->r8; @@ -266,5 +265,5 @@ asmlinkage int syscall_trace_entry(struct pt_regs *regs) asmlinkage void syscall_trace_exit(struct pt_regs *regs) { - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); } diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index e5aa3237853d..bfe88c6e60d5 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -843,8 +842,8 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) regs->ARM_ip = dir; if (dir == PTRACE_SYSCALL_EXIT) - tracehook_report_syscall_exit(regs, 0); - else if (tracehook_report_syscall_entry(regs)) + ptrace_report_syscall_exit(regs, 0); + else if (ptrace_report_syscall_entry(regs)) current_thread_info()->abi_syscall = -1; regs->ARM_ip = ip; diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b7845575f86f..230a47b9189e 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -1818,11 +1817,11 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) regs->regs[regno] = dir; if (dir == PTRACE_SYSCALL_ENTER) { - if (tracehook_report_syscall_entry(regs)) + if (ptrace_report_syscall_entry(regs)) forget_syscall(regs); regs->regs[regno] = saved_reg; } else if (!test_thread_flag(TIF_SINGLESTEP)) { - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); regs->regs[regno] = saved_reg; } else { regs->regs[regno] = saved_reg; @@ -1832,7 +1831,7 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) * tracer modifications to the registers may have rewound the * state machine. */ - tracehook_report_syscall_exit(regs, 1); + ptrace_report_syscall_exit(regs, 1); } } diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 1a5f54e0d272..0f7e7b653c72 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -321,7 +320,7 @@ long arch_ptrace(struct task_struct *child, long request, asmlinkage int syscall_trace_enter(struct pt_regs *regs) { if (test_thread_flag(TIF_SYSCALL_TRACE)) - if (tracehook_report_syscall_entry(regs)) + if (ptrace_report_syscall_entry(regs)) return -1; if (secure_computing() == -1) @@ -339,7 +338,7 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs) audit_syscall_exit(regs); if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_exit(regs, syscall_get_return_value(current, regs)); diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c index a11db009d0ea..a9898b27b756 100644 --- a/arch/h8300/kernel/ptrace.c +++ b/arch/h8300/kernel/ptrace.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -174,7 +173,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) long ret = 0; if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) + ptrace_report_syscall_entry(regs)) /* * Tracing decided this syscall should not happen. 
* We'll return a bogus call number to get an ENOSYS @@ -196,5 +195,5 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 1240f038cce0..6447763ce5a9 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -348,7 +348,7 @@ void do_trap0(struct pt_regs *regs) /* allow strace to catch syscall args */ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs))) + ptrace_report_syscall_entry(regs))) return; /* return -ENOSYS somewhere? */ /* Interrupts should be re-enabled for syscall processing */ @@ -386,7 +386,7 @@ void do_trap0(struct pt_regs *regs) /* allow strace to get the syscall return state */ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACE))) - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); break; case TRAP_DEBUG: diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 6a1439eaa050..6af64aae087d 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1217,7 +1217,7 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, struct pt_regs regs) { if (test_thread_flag(TIF_SYSCALL_TRACE)) - if (tracehook_report_syscall_entry(®s)) + if (ptrace_report_syscall_entry(®s)) return -ENOSYS; /* copy user rbs to kernel rbs */ @@ -1243,7 +1243,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(®s, step); + ptrace_report_syscall_exit(®s, step); /* copy user rbs to kernel rbs */ if (test_thread_flag(TIF_RESTORE_RSE)) diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index aa3a0b8d07e9..a0c99fe3118e 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -282,13 +282,13 @@ asmlinkage int syscall_trace_enter(void) int ret = 0; if (test_thread_flag(TIF_SYSCALL_TRACE)) - ret = tracehook_report_syscall_entry(task_pt_regs(current)); + ret = ptrace_report_syscall_entry(task_pt_regs(current)); return ret; } asmlinkage void syscall_trace_leave(void) { if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(task_pt_regs(current), 0); + ptrace_report_syscall_exit(task_pt_regs(current), 0); } #endif /* CONFIG_COLDFIRE */ diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index badd286882ae..5234d0c1dcaa 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -140,7 +139,7 @@ asmlinkage unsigned long do_syscall_trace_enter(struct pt_regs *regs) secure_computing_strict(regs->r12); if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) + ptrace_report_syscall_entry(regs)) /* * Tracing decided this syscall should not happen. 
* We'll return a bogus call number to get an ENOSYS @@ -161,7 +160,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } void ptrace_disable(struct task_struct *child) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index db7c5be1d4a3..567aec4abac0 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -1317,7 +1316,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) current_thread_info()->syscall = syscall; if (test_thread_flag(TIF_SYSCALL_TRACE)) { - if (tracehook_report_syscall_entry(regs)) + if (ptrace_report_syscall_entry(regs)) return -1; syscall = current_thread_info()->syscall; } @@ -1376,7 +1375,7 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs) trace_sys_exit(regs, regs_return_value(regs)); if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); user_enter(); } diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h index 90aa56c94af1..04d55ce18d50 100644 --- a/arch/nds32/include/asm/syscall.h +++ b/arch/nds32/include/asm/syscall.h @@ -39,7 +39,7 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) * * It's only valid to call this when @task is stopped for system * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), - * after tracehook_report_syscall_entry() returned nonzero to prevent + * after ptrace_report_syscall_entry() returned nonzero to prevent * the system call from taking place. 
* * This rolls back the register state in @regs so it's as if the diff --git a/arch/nds32/kernel/ptrace.c b/arch/nds32/kernel/ptrace.c index d0eda870fbc2..6a6988cf689d 100644 --- a/arch/nds32/kernel/ptrace.c +++ b/arch/nds32/kernel/ptrace.c @@ -3,7 +3,6 @@ #include #include -#include #include #include @@ -103,7 +102,7 @@ void user_disable_single_step(struct task_struct *child) asmlinkage int syscall_trace_enter(struct pt_regs *regs) { if (test_thread_flag(TIF_SYSCALL_TRACE)) { - if (tracehook_report_syscall_entry(regs)) + if (ptrace_report_syscall_entry(regs)) forget_syscall(regs); } return regs->syscallno; @@ -113,6 +112,6 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { int step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } diff --git a/arch/nios2/kernel/ptrace.c b/arch/nios2/kernel/ptrace.c index a6ea9e1b4f61..cd62f310778b 100644 --- a/arch/nios2/kernel/ptrace.c +++ b/arch/nios2/kernel/ptrace.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -134,7 +133,7 @@ asmlinkage int do_syscall_trace_enter(void) int ret = 0; if (test_thread_flag(TIF_SYSCALL_TRACE)) - ret = tracehook_report_syscall_entry(task_pt_regs(current)); + ret = ptrace_report_syscall_entry(task_pt_regs(current)); return ret; } @@ -142,5 +141,5 @@ asmlinkage int do_syscall_trace_enter(void) asmlinkage void do_syscall_trace_exit(void) { if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(task_pt_regs(current), 0); + ptrace_report_syscall_exit(task_pt_regs(current), 0); } diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c index 4d60ae2a12fa..b971740fc2aa 100644 --- a/arch/openrisc/kernel/ptrace.c +++ b/arch/openrisc/kernel/ptrace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -159,7 +158,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) long ret = 0; if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) + ptrace_report_syscall_entry(regs)) /* * Tracing decided this syscall should not happen. * We'll return a bogus call number to get an ENOSYS @@ -181,5 +180,5 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 65de6c4c9354..96ef6a6b66e5 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -316,7 +315,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, long do_syscall_trace_enter(struct pt_regs *regs) { if (test_thread_flag(TIF_SYSCALL_TRACE)) { - int rc = tracehook_report_syscall_entry(regs); + int rc = ptrace_report_syscall_entry(regs); /* * As tracesys_next does not set %r28 to -ENOSYS @@ -327,7 +326,7 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (rc) { /* * A nonzero return code from - * tracehook_report_syscall_entry() tells us + * ptrace_report_syscall_entry() tells us * to prevent the syscall execution. Skip * the syscall call and the syscall restart handling. 
* @@ -381,7 +380,7 @@ void do_syscall_trace_exit(struct pt_regs *regs) #endif if (stepping || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, stepping); + ptrace_report_syscall_exit(regs, stepping); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index c43f77e2ac31..f394b0d6473f 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -16,7 +16,7 @@ */ #include -#include +#include #include #include #include @@ -263,12 +263,12 @@ long do_syscall_trace_enter(struct pt_regs *regs) flags = read_thread_flags() & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE); if (flags) { - int rc = tracehook_report_syscall_entry(regs); + int rc = ptrace_report_syscall_entry(regs); if (unlikely(flags & _TIF_SYSCALL_EMU)) { /* * A nonzero return code from - * tracehook_report_syscall_entry() tells us to prevent + * ptrace_report_syscall_entry() tells us to prevent * the syscall execution, but we are not going to * execute it anyway. * @@ -334,7 +334,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } void __init pt_regs_check(void); diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index a89243730153..793c7da0554b 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -17,7 +17,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -241,7 +240,7 @@ long arch_ptrace(struct task_struct *child, long request, __visible int do_syscall_trace_enter(struct pt_regs *regs) { if (test_thread_flag(TIF_SYSCALL_TRACE)) - if (tracehook_report_syscall_entry(regs)) + if (ptrace_report_syscall_entry(regs)) return -1; /* @@ -266,7 +265,7 @@ __visible void do_syscall_trace_exit(struct pt_regs *regs) audit_syscall_exit(regs); if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 5281685f6ad1..d417988d9770 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -456,7 +455,7 @@ long arch_ptrace(struct task_struct *child, long request, asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) { if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) { + ptrace_report_syscall_entry(regs)) { regs->regs[0] = -ENOSYS; return -1; } @@ -484,5 +483,5 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index 5318174a0268..e7db48acb838 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -439,9 +438,9 @@ asmlinkage int syscall_trace(struct pt_regs *regs, int syscall_exit_p) if (test_thread_flag(TIF_SYSCALL_TRACE)) { if (syscall_exit_p) - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); else - ret = tracehook_report_syscall_entry(regs); + ret = 
ptrace_report_syscall_entry(regs); } return ret; diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 2b92155db8a5..86a7eb5c27ba 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -1095,7 +1094,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) user_exit(); if (test_thread_flag(TIF_SYSCALL_TRACE)) - ret = tracehook_report_syscall_entry(regs); + ret = ptrace_report_syscall_entry(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); @@ -1118,7 +1117,7 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs) trace_sys_exit(regs, regs->u_regs[UREG_I0]); if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); if (test_thread_flag(TIF_NOHZ)) user_enter(); diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index b425f47bddbb..bfaf6ab1ac03 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -135,7 +134,7 @@ int syscall_trace_enter(struct pt_regs *regs) if (!test_thread_flag(TIF_SYSCALL_TRACE)) return 0; - return tracehook_report_syscall_entry(regs); + return ptrace_report_syscall_entry(regs); } void syscall_trace_leave(struct pt_regs *regs) @@ -151,7 +150,7 @@ void syscall_trace_leave(struct pt_regs *regs) if (!test_thread_flag(TIF_SYSCALL_TRACE)) return; - tracehook_report_syscall_exit(regs, 0); + ptrace_report_syscall_exit(regs, 0); /* force do_signal() --> is_syscall() */ if (ptraced & PT_PTRACED) set_thread_flag(TIF_SIGPENDING); diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index bb3f4797d212..323c678a691f 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #define CREATE_TRACE_POINTS @@ -550,7 +549,7 @@ int do_syscall_trace_enter(struct pt_regs *regs) regs->areg[2] = -ENOSYS; if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) { + ptrace_report_syscall_entry(regs)) { regs->areg[2] = -ENOSYS; regs->syscall = NO_SYSCALL; return 0; @@ -583,5 +582,5 @@ void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 81695eb02a12..5a80fe728dc8 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -44,7 +44,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * * It's only valid to call this when @task is stopped for system * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or - * %SYSCALL_WORK_SYSCALL_AUDIT), after tracehook_report_syscall_entry() + * %SYSCALL_WORK_SYSCALL_AUDIT), after ptrace_report_syscall_entry() * returned nonzero to prevent the system call from taking place. 
* * This rolls back the register state in @regs so it's as if the diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2e2b8d6140ed..a670e9fba7a9 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -3,7 +3,7 @@ #define __LINUX_ENTRYCOMMON_H #include -#include +#include #include #include #include @@ -95,7 +95,7 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs #ifndef arch_syscall_enter_tracehook static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs) { - return tracehook_report_syscall_entry(regs); + return ptrace_report_syscall_entry(regs); } #endif @@ -294,7 +294,7 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step); #ifndef arch_syscall_exit_tracehook static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) { - tracehook_report_syscall_exit(regs, step); + ptrace_report_syscall_exit(regs, step); } #endif diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 91b1074edb4c..5310f43e4762 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -440,4 +440,55 @@ static inline int ptrace_report_syscall(unsigned long message) current->ptrace_message = 0; return fatal_signal_pending(current); } + +/** + * ptrace_report_syscall_entry - task is about to attempt a system call + * @regs: user register state of current task + * + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just + * entered the kernel for a system call. Full user register state is + * available here. Changing the values in @regs can affect the system + * call number and arguments to be tried. It is safe to block here, + * preventing the system call from beginning. + * + * Returns zero normally, or nonzero if the calling arch code should abort + * the system call. That must prevent normal entry so no system call is + * made. If @task ever returns to user mode after this, its register state + * is unspecified, but should be something harmless like an %ENOSYS error + * return. It should preserve enough information so that syscall_rollback() + * can work (see asm-generic/syscall.h). + * + * Called without locks, just after entering kernel mode. + */ +static inline __must_check int ptrace_report_syscall_entry( + struct pt_regs *regs) +{ + return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY); +} + +/** + * ptrace_report_syscall_exit - task has just finished a system call + * @regs: user register state of current task + * @step: nonzero if simulating single-step or block-step + * + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when + * the current task has just finished an attempted system call. Full + * user register state is available here. It is safe to block here, + * preventing signals from being processed. + * + * If @step is nonzero, this report is also in lieu of the normal + * trap that would follow the system call instruction because + * user_enable_block_step() or user_enable_single_step() was used. + * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. + * + * Called without locks, just before checking for pending signals. 
+ */ +static inline void ptrace_report_syscall_exit(struct pt_regs *regs, int step) +{ + if (step) + user_single_step_report(regs); + else + ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT); +} #endif diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 998bc3863559..819e82ac09bd 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -52,57 +52,6 @@ struct linux_binprm; -/** - * tracehook_report_syscall_entry - task is about to attempt a system call - * @regs: user register state of current task - * - * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or - * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just - * entered the kernel for a system call. Full user register state is - * available here. Changing the values in @regs can affect the system - * call number and arguments to be tried. It is safe to block here, - * preventing the system call from beginning. - * - * Returns zero normally, or nonzero if the calling arch code should abort - * the system call. That must prevent normal entry so no system call is - * made. If @task ever returns to user mode after this, its register state - * is unspecified, but should be something harmless like an %ENOSYS error - * return. It should preserve enough information so that syscall_rollback() - * can work (see asm-generic/syscall.h). - * - * Called without locks, just after entering kernel mode. - */ -static inline __must_check int tracehook_report_syscall_entry( - struct pt_regs *regs) -{ - return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY); -} - -/** - * tracehook_report_syscall_exit - task has just finished a system call - * @regs: user register state of current task - * @step: nonzero if simulating single-step or block-step - * - * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when - * the current task has just finished an attempted system call. Full - * user register state is available here. It is safe to block here, - * preventing signals from being processed. - * - * If @step is nonzero, this report is also in lieu of the normal - * trap that would follow the system call instruction because - * user_enable_block_step() or user_enable_single_step() was used. - * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. - * - * Called without locks, just before checking for pending signals. - */ -static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) -{ - if (step) - user_single_step_report(regs); - else - ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT); -} - /** * tracehook_signal_handler - signal handler setup is complete * @stepping: nonzero if debugger single-step or block-step in use diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 3747bf816f9a..b7af92e07d1f 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -114,7 +114,7 @@ struct ptrace_rseq_configuration { /* * These values are stored in task->ptrace_message - * by tracehook_report_syscall_* to describe the current syscall-stop. + * by ptrace_report_syscall_* to describe the current syscall-stop. 
*/ #define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 #define PTRACE_EVENTMSG_SYSCALL_EXIT 2 diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bad713684c2e..f52e57c4d6d8 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From bcbb7bf6ccde7cb969a5642879832bc84ebf06a3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2022 12:59:35 -0700 Subject: io_uring: allow submissions to continue on error By default, io_uring will stop submitting a batch of requests if we run into an error submitting a request. This isn't strictly necessary, as the error result is passed out-of-band via a CQE anyway. And it can be a bit confusing for some applications. Provide a way to set up a ring that will continue submitting on error, once the error CQE has been posted. There's still one case that will break out of submission. If we fail allocating a request, then we'll still return -ENOMEM. We could in theory post a CQE for that condition too even if we never got a request. Leave that for a potential followup. Reported-by: Dylan Yudaken Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3145c9cacee0..229b31d644ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7801,8 +7801,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } /* will complete beyond this point, count as submitted */ submitted++; - if (io_submit_sqe(ctx, req, sqe)) - break; + if (io_submit_sqe(ctx, req, sqe)) { + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) + break; + } } while (submitted < nr); if (unlikely(submitted != nr)) { @@ -11265,7 +11271,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8bd4bfdd9a89..d2be4eb22008 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -101,6 +101,7 @@ enum { #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ +#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ enum { IORING_OP_NOP, -- cgit v1.2.3 From 9bb984f28d5bcb917d35d930fcfb89f90f9449fd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Mar 2022 01:05:09 -0800 Subject: bpf: Remove BPF_SKB_DELIVERY_TIME_NONE and rename s/delivery_time_/tstamp_/ This patch simplifies the uapi bpf.h regarding the tstamp type and uses a similar way as the kernel to describe the value stored in __sk_buff->tstamp. My earlier thought was to avoid describing the semantics and clock base for the rcv timestamp until there is more clarity on the use case, hence the __sk_buff->delivery_time_type naming instead of __sk_buff->tstamp_type. On further thought, it can reuse the UNSPEC naming.
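As a quick orientation, here is a minimal tc-bpf sketch of the renamed interface this patch introduces (illustrative only, not part of the patch; the SEC() name and the choice of bpf_ktime_get_ns() as the mono time source are the editor's assumptions):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int stamp_egress(struct __sk_buff *skb)
{
	/* Reading tstamp_type marks the prog with tstamp_type_access,
	 * so skb->tstamp is subsequently passed through to it as-is.
	 */
	if (skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO)
		/* A non-zero tstamp requires BPF_SKB_TSTAMP_DELIVERY_MONO;
		 * clearing would instead be (skb, 0, BPF_SKB_TSTAMP_UNSPEC).
		 */
		bpf_skb_set_tstamp(skb, bpf_ktime_get_ns(),
				   BPF_SKB_TSTAMP_DELIVERY_MONO);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";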
This patch first removes BPF_SKB_DELIVERY_TIME_NONE and also renames BPF_SKB_DELIVERY_TIME_UNSPEC to BPF_SKB_TSTAMP_UNSPEC and BPF_SKB_DELIVERY_TIME_MONO to BPF_SKB_TSTAMP_DELIVERY_MONO. The semantics of BPF_SKB_TSTAMP_DELIVERY_MONO are the same: __sk_buff->tstamp has delivery time in mono clock base. BPF_SKB_TSTAMP_UNSPEC means __sk_buff->tstamp has the (rcv) tstamp at ingress and the delivery time at egress. At egress, the clock base could be found from skb->sk->sk_clockid. __sk_buff->tstamp == 0 naturally means NONE, so NONE is not needed. With BPF_SKB_TSTAMP_UNSPEC for the rcv tstamp at ingress, the __sk_buff->delivery_time_type is also renamed to __sk_buff->tstamp_type, which was also suggested in the earlier discussion: https://lore.kernel.org/bpf/b181acbe-caf8-502d-4b7b-7d96b9fc5d55@iogearbox.net/ The above will then make __sk_buff->tstamp and __sk_buff->tstamp_type the same as their kernel counterparts, skb->tstamp and skb->mono_delivery_time. The internal kernel function bpf_skb_convert_dtime_type_read() is then renamed to bpf_skb_convert_tstamp_type_read() and can be simplified now that BPF_SKB_DELIVERY_TIME_NONE is gone. A BPF_ALU32_IMM(BPF_AND) insn is also saved by using BPF_JMP32_IMM(BPF_JSET). The bpf helper bpf_skb_set_delivery_time() is also renamed to bpf_skb_set_tstamp(). The arg name is also changed from dtime to tstamp. It only allows setting a zero tstamp for BPF_SKB_TSTAMP_UNSPEC; this could be relaxed later if there is a use case for changing mono delivery time to non-mono. prog->delivery_time_access is also renamed to prog->tstamp_type_access. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220309090509.3712315-1-kafai@fb.com --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 40 ++++++++++--------- net/core/filter.c | 88 ++++++++++++++++-------------------------- tools/include/uapi/linux/bpf.h | 40 ++++++++++--------- 4 files changed, 77 insertions(+), 93 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9bf26307247f..05ed9bd31b45 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -573,7 +573,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ - delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ + tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. * - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together.
* - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. 
*/ __u64 hwtstamp; }; diff --git a/net/core/filter.c b/net/core/filter.c index f914e4b13b18..03655f2074ae 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7388,36 +7388,36 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, - u64, dtime, u32, dtime_type) +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; - switch (dtime_type) { - case BPF_SKB_DELIVERY_TIME_MONO: - if (!dtime) + switch (tstamp_type) { + case BPF_SKB_TSTAMP_DELIVERY_MONO: + if (!tstamp) return -EINVAL; - skb->tstamp = dtime; + skb->tstamp = tstamp; skb->mono_delivery_time = 1; break; - case BPF_SKB_DELIVERY_TIME_NONE: - if (dtime) + case BPF_SKB_TSTAMP_UNSPEC: + if (tstamp) return -EINVAL; skb->tstamp = 0; skb->mono_delivery_time = 0; break; default: - return -EOPNOTSUPP; + return -EINVAL; } return 0; } -static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { - .func = bpf_skb_set_delivery_time, +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -7786,8 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; - case BPF_FUNC_skb_set_delivery_time: - return &bpf_skb_set_delivery_time_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8127,9 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetof(struct __sk_buff, delivery_time_type): + case offsetof(struct __sk_buff, tstamp_type): return false; - case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -8484,14 +8484,14 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; - case offsetof(struct __sk_buff, delivery_time_type): + case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog - * has used __sk_buff->delivery_time_type or not. - * Thus, we need to set prog->delivery_time_access + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. 
*/ - ((struct bpf_prog *)prog)->delivery_time_access = 1; + ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } @@ -8888,42 +8888,22 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, - struct bpf_insn *insn) +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; + /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); - *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); - - *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, - offsetof(struct sk_buff, tstamp)); - *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); - /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); - *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); - -#ifdef CONFIG_NET_CLS_ACT - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* At ingress, value_reg = 0 */ - *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK, 2); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); *insn++ = BPF_JMP_A(1); -#endif - - /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); - /* 15 insns with CONFIG_NET_CLS_ACT */ return insn; } @@ -8956,11 +8936,11 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_CLS_ACT - /* If the delivery_time_type is read, + /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. - * Thus, read skb->tstamp as is if delivery_time_access is true. + * Thus, read skb->tstamp as is if tstamp_type_access is true. */ - if (!prog->delivery_time_access) { + if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; @@ -8990,13 +8970,13 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_CLS_ACT - /* If the delivery_time_type is read, + /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. - * Thus, write skb->tstamp as is if delivery_time_access is true. + * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * mono_delivery_time bit also. 
*/ - if (!prog->delivery_time_access) { + if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); @@ -9329,8 +9309,8 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, insn = bpf_convert_tstamp_read(prog, si, insn); break; - case offsetof(struct __sk_buff, delivery_time_type): - insn = bpf_convert_dtime_type_read(si, insn); + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. * - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. * - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. 
*/ __u64 hwtstamp; }; -- cgit v1.2.3 From 58617014405ad5c9f94f464444f4972dabb71ca7 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 10 Mar 2022 23:53:35 +0800 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup() Fix the descriptions of the return values of helper bpf_current_task_under_cgroup(). Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Hengqi Chen Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220310155335.1278783-1-hengqi.chen@gmail.com --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d288a0a9f797..e9978a916c3e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2302,8 +2302,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if current task belongs to the cgroup2. - * * 1, if current task does not belong to the cgroup2. + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d288a0a9f797..e9978a916c3e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2302,8 +2302,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if current task belongs to the cgroup2. - * * 1, if current task does not belong to the cgroup2. + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) -- cgit v1.2.3 From 174b16946e39ebd369097e0f773536c91a8c1a4c Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 2 Mar 2022 12:13:58 +0100 Subject: bpf-lsm: Introduce new helper bpf_ima_file_hash() ima_file_hash() has been modified to calculate the measurement of a file on demand, if it has not been already performed by IMA or the measurement is not fresh. For compatibility reasons, ima_inode_hash() remains unchanged. Keep the same approach in eBPF and introduce the new helper bpf_ima_file_hash() to take advantage of the modified behavior of ima_file_hash(). Signed-off-by: Roberto Sassu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220302111404.193900-4-roberto.sassu@huawei.com --- include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/bpf_lsm.c | 20 ++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 3 files changed, 42 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e9978a916c3e..99fab54ae9c0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5119,6 +5119,16 @@ union bpf_attr { * 0 on success. * **-EINVAL** for invalid input * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. 
+ * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst*. + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,6 +5324,7 @@ union bpf_attr { FN(xdp_store_bytes), \ FN(copy_from_user_task), \ FN(skb_set_tstamp), \ + FN(ima_file_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9e4ecc990647..e8d27af5bbcc 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size) +{ + return ima_file_hash(file, dst, size); +} + +BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) + +static const struct bpf_func_proto bpf_ima_file_hash_proto = { + .func = bpf_ima_file_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + case BPF_FUNC_ima_file_hash: + return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e9978a916c3e..99fab54ae9c0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5119,6 +5119,16 @@ union bpf_attr { * 0 on success. * **-EINVAL** for invalid input * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst*. + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,6 +5324,7 @@ union bpf_attr { FN(xdp_store_bytes), \ FN(copy_from_user_task), \ FN(skb_set_tstamp), \ + FN(ima_file_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 1926407a4ab0e59d5a27bed7b82029b356d80fa0 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 9 Mar 2022 23:20:33 +0100 Subject: net: openvswitch: fix uAPI incompatibility with existing user space A few years ago, OVS user space made a strange choice in the commit [1] to define types valid only for user space inside its copy of a kernel uAPI header. The '#ifndef __KERNEL__' guard and another attribute were added later. This leads to the inevitable clash between user space and kernel types when the kernel uAPI is extended. The issue was unveiled by the addition of a new type for IPv6 extension headers in the kernel uAPI.
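A simplified reconstruction of the collision (illustrative enums, not the literal headers; the numeric values follow from the enum layout in this patch):

/* What the kernel's enum ovs_key_attr assigned before this fix: */
enum {
	KERNEL_OVS_KEY_ATTR_NSH          = 28,
	KERNEL_OVS_KEY_ATTR_IPV6_EXTHDRS = 29,	/* new kernel type */
	KERNEL_OVS_KEY_ATTR_TUNNEL_INFO  = 30,	/* hidden behind __KERNEL__ */
};

/* What the user space copy of the header squatted on: */
enum {
	USER_OVS_KEY_ATTR_NSH           = 28,
	USER_OVS_KEY_ATTR_PACKET_TYPE   = 29,	/* user space only */
	USER_OVS_KEY_ATTR_ND_EXTENSIONS = 30,	/* user space only */
};

/* Attribute 29 emitted by the kernel (IPV6_EXTHDRS) is therefore parsed
 * by old user space as PACKET_TYPE, and the netlink message is dropped.
 */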
When the kernel provides the OVS_KEY_ATTR_IPV6_EXTHDRS attribute to an older user space application, the application tries to parse it as OVS_KEY_ATTR_PACKET_TYPE and discards the whole netlink message as malformed. Since OVS_KEY_ATTR_IPV6_EXTHDRS is supplied along with every IPv6 packet that goes to the user space, IPv6 support is fully broken. Fix that by bringing these user space attributes into the kernel uAPI to avoid the clash. Strictly speaking this is not a problem of the kernel uAPI, but changing it is the only way to avoid breaking older user space applications at this point. These two types are explicitly rejected now since they should not be passed to the kernel. Additionally, OVS_KEY_ATTR_TUNNEL_INFO is moved out of the '#ifdef __KERNEL__' block, as there is no good reason to hide it from user space. It is also explicitly rejected now, because it is for in-kernel use only. Comments with warnings were added to avoid the problem coming back. (1 << type) is converted to (1ULL << type) to avoid integer overflow on OVS_KEY_ATTR_IPV6_EXTHDRS, since it equals 32 now. [1] beb75a40fdc2 ("userspace: Switching of L3 packets in L2 pipeline") Fixes: 28a3f0601727 ("net: openvswitch: IPv6: Add IPv6 extension header support") Link: https://lore.kernel.org/netdev/3adf00c7-fe65-3ef4-b6d7-6d8a0cad8a5f@nvidia.com Link: https://github.com/openvswitch/ovs/commit/beb75a40fdc295bfd6521b0068b4cd12f6de507c Reported-by: Roi Dayan Signed-off-by: Ilya Maximets Acked-by: Nicolas Dichtel Acked-by: Aaron Conole Link: https://lore.kernel.org/r/20220309222033.3018976-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 18 ++++++++++++++---- net/openvswitch/flow_netlink.c | 13 ++++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9d1710f20505..ce3e1738d427 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -351,11 +351,21 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ - OVS_KEY_ATTR_IPV6_EXTHDRS, /* struct ovs_key_ipv6_exthdr */ -#ifdef __KERNEL__ - OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ -#endif + /* User space decided to squat on types 29 and 30. They are defined + * below, but should not be sent to the kernel. + * + * WARNING: No new types should be added unless they are defined + * for both kernel and user space (no 'ifdef's). It's hard + * to keep compatibility otherwise. + */ + OVS_KEY_ATTR_PACKET_TYPE, /* be32 packet type */ + OVS_KEY_ATTR_ND_EXTENSIONS, /* IPv6 Neighbor Discovery extensions */ + + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info. + * For in-kernel use only. + */ + OVS_KEY_ATTR_IPV6_EXTHDRS, /* struct ovs_key_ipv6_exthdr */ __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 8b4124820f7d..5176f6ccac8e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -346,7 +346,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function.
*/ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 30); + BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 32); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -482,7 +482,14 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, return -EINVAL; } - if (attrs & (1 << type)) { + if (type == OVS_KEY_ATTR_PACKET_TYPE || + type == OVS_KEY_ATTR_ND_EXTENSIONS || + type == OVS_KEY_ATTR_TUNNEL_INFO) { + OVS_NLERR(log, "Key type %d is not supported", type); + return -EINVAL; + } + + if (attrs & (1ULL << type)) { OVS_NLERR(log, "Duplicate key (type %d).", type); return -EINVAL; } @@ -495,7 +502,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { - attrs |= 1 << type; + attrs |= 1ULL << type; a[type] = nla; } } -- cgit v1.2.3 From 2916b7a9c7c25ecf9be2f37e567a277e861f8e3f Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Tue, 22 Feb 2022 20:36:39 +0530 Subject: nl80211: fix typo of NL80211_IF_TYPE_OCB in documentation It should be NL80211_IFTYPE_OCB instead. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1645542399-4680-1-git-send-email-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 98ed52663d6b..0568a79097b8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3259,7 +3259,7 @@ enum nl80211_attrs { * and therefore can't be created in the normal ways, use the * %NL80211_CMD_START_P2P_DEVICE and %NL80211_CMD_STOP_P2P_DEVICE * commands to create and destroy one - * @NL80211_IF_TYPE_OCB: Outside Context of a BSS + * @NL80211_IFTYPE_OCB: Outside Context of a BSS * This mode corresponds to the MIB variable dot11OCBActivated=true * @NL80211_IFTYPE_NAN: NAN device interface type (not a netdev) * @NL80211_IFTYPE_MAX: highest interface type number currently defined -- cgit v1.2.3 From b20dc3c684580ddc07eb48ee3c3dc7597cd5eebf Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:42 +0100 Subject: gtp: Allow to create GTP device without FDs Currently, when a user wants to create a GTP device, they have to provide file handles to the sockets created in userspace (IFLA_GTP_FD0, IFLA_GTP_FD1). This behaviour is not ideal, considering the option of adding support for GTP device creation through ip link. The ip link application is not a good place to create such sockets. This patch allows creating a GTP device without providing the IFLA_GTP_FD0 and IFLA_GTP_FD1 arguments. If the user sets the IFLA_GTP_CREATE_SOCKETS attribute, then the GTP module takes care of creating the UDP sockets by itself. Sockets are created with the commonly known UDP ports used for the GTP protocol (GTP0_PORT and GTP1U_PORT). In this case we don't have to provide encap_destroy because no extra deinitialization is needed; everything is covered by udp_tunnel_sock_release. Note: a GTP instance created with only this change applied does not handle GTP Echo Requests. This is implemented in the following patch.
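For context, a sketch of the socket setup a userspace daemon traditionally had to perform before handing the fds over as IFLA_GTP_FD0/IFLA_GTP_FD1 (a plain-C illustration using the standard sockets API; the port values 3386 and 2152 are the well-known GTP0/GTP1U ports, and the netlink plumbing is elided):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#define GTP0_PORT  3386	/* GTPv0 */
#define GTP1U_PORT 2152	/* GTPv1-U */

static int gtp_udp_socket(uint16_t port)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	int fd0 = gtp_udp_socket(GTP0_PORT);	/* would become IFLA_GTP_FD0 */
	int fd1u = gtp_udp_socket(GTP1U_PORT);	/* would become IFLA_GTP_FD1 */

	if (fd0 < 0 || fd1u < 0) {
		perror("gtp socket setup");
		return 1;
	}
	printf("fd0=%d fd1u=%d\n", fd0, fd1u);
	/* ...pass both fds in an RTM_NEWLINK request creating the gtp link... */
	return 0;
}

With IFLA_GTP_CREATE_SOCKETS the kernel performs the equivalent of the above internally via udp_sock_create() and setup_udp_tunnel_sock(), as the diff below shows.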
Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen --- drivers/net/gtp.c | 101 +++++++++++++++++++++++++++++++++++-------- include/uapi/linux/if_link.h | 1 + 2 files changed, 85 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index bf087171bcf0..25d8521897b3 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -66,8 +66,10 @@ struct gtp_dev { struct sock *sk0; struct sock *sk1u; + u8 sk_created; struct net_device *dev; + struct net *net; unsigned int role; unsigned int hash_size; @@ -320,8 +322,16 @@ static void gtp_encap_disable_sock(struct sock *sk) static void gtp_encap_disable(struct gtp_dev *gtp) { - gtp_encap_disable_sock(gtp->sk0); - gtp_encap_disable_sock(gtp->sk1u); + if (gtp->sk_created) { + udp_tunnel_sock_release(gtp->sk0->sk_socket); + udp_tunnel_sock_release(gtp->sk1u->sk_socket); + gtp->sk_created = false; + gtp->sk0 = NULL; + gtp->sk1u = NULL; + } else { + gtp_encap_disable_sock(gtp->sk0); + gtp_encap_disable_sock(gtp->sk1u); + } } /* UDP encapsulation receive handler. See net/ipv4/udp.c. @@ -656,17 +666,69 @@ static void gtp_destructor(struct net_device *dev) kfree(gtp->tid_hash); } +static struct sock *gtp_create_sock(int type, struct gtp_dev *gtp) +{ + struct udp_tunnel_sock_cfg tuncfg = {}; + struct udp_port_cfg udp_conf = { + .local_ip.s_addr = htonl(INADDR_ANY), + .family = AF_INET, + }; + struct net *net = gtp->net; + struct socket *sock; + int err; + + if (type == UDP_ENCAP_GTP0) + udp_conf.local_udp_port = htons(GTP0_PORT); + else if (type == UDP_ENCAP_GTP1U) + udp_conf.local_udp_port = htons(GTP1U_PORT); + else + return ERR_PTR(-EINVAL); + + err = udp_sock_create(net, &udp_conf, &sock); + if (err) + return ERR_PTR(err); + + tuncfg.sk_user_data = gtp; + tuncfg.encap_type = type; + tuncfg.encap_rcv = gtp_encap_recv; + tuncfg.encap_destroy = NULL; + + setup_udp_tunnel_sock(net, sock, &tuncfg); + + return sock->sk; +} + +static int gtp_create_sockets(struct gtp_dev *gtp, struct nlattr *data[]) +{ + struct sock *sk1u = NULL; + struct sock *sk0 = NULL; + + sk0 = gtp_create_sock(UDP_ENCAP_GTP0, gtp); + if (IS_ERR(sk0)) + return PTR_ERR(sk0); + + sk1u = gtp_create_sock(UDP_ENCAP_GTP1U, gtp); + if (IS_ERR(sk1u)) { + udp_tunnel_sock_release(sk0->sk_socket); + return PTR_ERR(sk1u); + } + + gtp->sk_created = true; + gtp->sk0 = sk0; + gtp->sk1u = sk1u; + + return 0; +} + static int gtp_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + unsigned int role = GTP_ROLE_GGSN; struct gtp_dev *gtp; struct gtp_net *gn; int hashsize, err; - if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1]) - return -EINVAL; - gtp = netdev_priv(dev); if (!data[IFLA_GTP_PDP_HASHSIZE]) { @@ -677,11 +739,23 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, hashsize = 1024; } + if (data[IFLA_GTP_ROLE]) { + role = nla_get_u32(data[IFLA_GTP_ROLE]); + if (role > GTP_ROLE_SGSN) + return -EINVAL; + } + gtp->role = role; + + gtp->net = src_net; + err = gtp_hashtable_new(gtp, hashsize); if (err < 0) return err; - err = gtp_encap_enable(gtp, data); + if (data[IFLA_GTP_CREATE_SOCKETS]) + err = gtp_create_sockets(gtp, data); + else + err = gtp_encap_enable(gtp, data); if (err < 0) goto out_hashtable; @@ -726,6 +800,7 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_FD1] = { .type = NLA_U32 }, [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, [IFLA_GTP_ROLE] = { .type = NLA_U32 }, + [IFLA_GTP_CREATE_SOCKETS] = { 
.type = NLA_U8 }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], @@ -848,7 +923,9 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) { struct sock *sk1u = NULL; struct sock *sk0 = NULL; - unsigned int role = GTP_ROLE_GGSN; + + if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1]) + return -EINVAL; if (data[IFLA_GTP_FD0]) { u32 fd0 = nla_get_u32(data[IFLA_GTP_FD0]); @@ -868,18 +945,8 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) } } - if (data[IFLA_GTP_ROLE]) { - role = nla_get_u32(data[IFLA_GTP_ROLE]); - if (role > GTP_ROLE_SGSN) { - gtp_encap_disable_sock(sk0); - gtp_encap_disable_sock(sk1u); - return -EINVAL; - } - } - gtp->sk0 = sk0; gtp->sk1u = sk1u; - gtp->role = role; return 0; } diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ddca20357e7e..ebd2aa3ef809 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -887,6 +887,7 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_CREATE_SOCKETS, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.2.3 From 9af41cc33471ea1efa6f77e188f055cc77d0a5c5 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:43 +0100 Subject: gtp: Implement GTP echo response Adding a GTP device through ip link creates a situation where there is no userspace daemon that would handle GTP messages (Echo Request, for example). A GTP-U instance that did not respond to echo requests would violate the GTP specification. When a GTP packet arrives with the GTP_ECHO_REQ message type, a GTP_ECHO_RSP is sent back to the sender. The GTP_ECHO_RSP message should contain an information element with the GTPIE_RECOVERY tag and the restart counter value. For GTPv1 the restart counter is not used and should be equal to 0; for GTPv0 the restart counter contains the value provided from userspace (IFLA_GTP_RESTART_COUNT). Signed-off-by: Wojciech Drewek Suggested-by: Harald Welte Reviewed-by: Harald Welte Tested-by: Harald Welte Signed-off-by: Tony Nguyen --- drivers/net/gtp.c | 213 +++++++++++++++++++++++++++++++++++++++---- include/net/gtp.h | 31 +++++++ include/uapi/linux/if_link.h | 1 + 3 files changed, 229 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 25d8521897b3..bf434d79f868 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -75,6 +75,8 @@ struct gtp_dev { unsigned int hash_size; struct hlist_head *tid_hash; struct hlist_head *addr_hash; + + u8 restart_count; }; static unsigned int gtp_net_id __read_mostly; @@ -217,6 +219,106 @@ err: return -1; } +static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, + const struct sock *sk, + __be32 daddr, __be32 saddr) +{ + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_oif = sk->sk_bound_dev_if; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->flowi4_tos = RT_CONN_FLAGS(sk); + fl4->flowi4_proto = sk->sk_protocol; + + return ip_route_output_key(sock_net(sk), fl4); +} + +/* GSM TS 09.60. 7.3 + * In all Path Management messages: + * - TID: is not used and shall be set to 0. + * - Flow Label is not used and shall be set to 0 + * In signalling messages: + * - number: this field is not yet used in signalling messages. + * It shall be set to 255 by the sender and shall be ignored + * by the receiver + * Returns true if the echo req was correct, false otherwise.
+ */ +static bool gtp0_validate_echo_req(struct gtp0_header *gtp0) +{ + return !(gtp0->tid || (gtp0->flags ^ 0x1e) || + gtp0->number != 0xff || gtp0->flow); +} + +static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) +{ + struct gtp0_packet *gtp_pkt; + struct gtp0_header *gtp0; + struct rtable *rt; + struct flowi4 fl4; + struct iphdr *iph; + __be16 seq; + + gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); + + if (!gtp0_validate_echo_req(gtp0)) + return -1; + + seq = gtp0->seq; + + /* pull GTP and UDP headers */ + skb_pull_data(skb, sizeof(struct gtp0_header) + sizeof(struct udphdr)); + + gtp_pkt = skb_push(skb, sizeof(struct gtp0_packet)); + memset(gtp_pkt, 0, sizeof(struct gtp0_packet)); + + gtp_pkt->gtp0_h.flags = 0x1e; /* v0, GTP-non-prime. */ + gtp_pkt->gtp0_h.type = GTP_ECHO_RSP; + gtp_pkt->gtp0_h.length = + htons(sizeof(struct gtp0_packet) - sizeof(struct gtp0_header)); + + /* GSM TS 09.60. 7.3 The Sequence Number in a signalling response + * message shall be copied from the signalling request message + * that the GSN is replying to. + */ + gtp_pkt->gtp0_h.seq = seq; + + /* GSM TS 09.60. 7.3 In all Path Management Flow Label and TID + * are not used and shall be set to 0. + */ + gtp_pkt->gtp0_h.flow = 0; + gtp_pkt->gtp0_h.tid = 0; + gtp_pkt->gtp0_h.number = 0xff; + gtp_pkt->gtp0_h.spare[0] = 0xff; + gtp_pkt->gtp0_h.spare[1] = 0xff; + gtp_pkt->gtp0_h.spare[2] = 0xff; + + gtp_pkt->ie.tag = GTPIE_RECOVERY; + gtp_pkt->ie.val = gtp->restart_count; + + iph = ip_hdr(skb); + + /* find route to the sender, + * src address becomes dst address and vice versa. + */ + rt = ip4_route_output_gtp(&fl4, gtp->sk0, iph->saddr, iph->daddr); + if (IS_ERR(rt)) { + netdev_dbg(gtp->dev, "no route for echo response from %pI4\n", + &iph->saddr); + return -1; + } + + udp_tunnel_xmit_skb(rt, gtp->sk0, skb, + fl4.saddr, fl4.daddr, + iph->tos, + ip4_dst_hoplimit(&rt->dst), + 0, + htons(GTP0_PORT), htons(GTP0_PORT), + !net_eq(sock_net(gtp->sk1u), + dev_net(gtp->dev)), + false); + return 0; +} + /* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { @@ -233,6 +335,13 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) if ((gtp0->flags >> 5) != GTP_V0) return 1; + /* If the sockets were created in kernel, it means that + * there is no daemon running in userspace which would + * handle echo request. + */ + if (gtp0->type == GTP_ECHO_REQ && gtp->sk_created) + return gtp0_send_echo_resp(gtp, skb); + if (gtp0->type != GTP_TPDU) return 1; @@ -245,6 +354,75 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) return gtp_rx(pctx, skb, hdrlen, gtp->role); } +static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) +{ + struct gtp1_header_long *gtp1u; + struct gtp1u_packet *gtp_pkt; + struct rtable *rt; + struct flowi4 fl4; + struct iphdr *iph; + + gtp1u = (struct gtp1_header_long *)(skb->data + sizeof(struct udphdr)); + + /* 3GPP TS 29.281 5.1 - For the Echo Request, Echo Response, + * Error Indication and Supported Extension Headers Notification + * messages, the S flag shall be set to 1 and TEID shall be set to 0. 
+ */ + if (!(gtp1u->flags & GTP1_F_SEQ) || gtp1u->tid) + return -1; + + /* pull GTP and UDP headers */ + skb_pull_data(skb, + sizeof(struct gtp1_header_long) + sizeof(struct udphdr)); + + gtp_pkt = skb_push(skb, sizeof(struct gtp1u_packet)); + memset(gtp_pkt, 0, sizeof(struct gtp1u_packet)); + + /* S flag must be set to 1 */ + gtp_pkt->gtp1u_h.flags = 0x32; + gtp_pkt->gtp1u_h.type = GTP_ECHO_RSP; + /* seq, npdu and next should be counted to the length of the GTP packet + * that's why szie of gtp1_header should be subtracted, + * not why szie of gtp1_header_long. + */ + gtp_pkt->gtp1u_h.length = + htons(sizeof(struct gtp1u_packet) - sizeof(struct gtp1_header)); + /* 3GPP TS 29.281 5.1 - TEID has to be set to 0 */ + gtp_pkt->gtp1u_h.tid = 0; + + /* 3GPP TS 29.281 7.7.2 - The Restart Counter value in the + * Recovery information element shall not be used, i.e. it shall + * be set to zero by the sender and shall be ignored by the receiver. + * The Recovery information element is mandatory due to backwards + * compatibility reasons. + */ + gtp_pkt->ie.tag = GTPIE_RECOVERY; + gtp_pkt->ie.val = 0; + + iph = ip_hdr(skb); + + /* find route to the sender, + * src address becomes dst address and vice versa. + */ + rt = ip4_route_output_gtp(&fl4, gtp->sk1u, iph->saddr, iph->daddr); + if (IS_ERR(rt)) { + netdev_dbg(gtp->dev, "no route for echo response from %pI4\n", + &iph->saddr); + return -1; + } + + udp_tunnel_xmit_skb(rt, gtp->sk1u, skb, + fl4.saddr, fl4.daddr, + iph->tos, + ip4_dst_hoplimit(&rt->dst), + 0, + htons(GTP1U_PORT), htons(GTP1U_PORT), + !net_eq(sock_net(gtp->sk1u), + dev_net(gtp->dev)), + false); + return 0; +} + static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { unsigned int hdrlen = sizeof(struct udphdr) + @@ -260,6 +438,13 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) if ((gtp1->flags >> 5) != GTP_V1) return 1; + /* If the sockets were created in kernel, it means that + * there is no daemon running in userspace which would + * handle echo request. 
+ */ + if (gtp1->type == GTP_ECHO_REQ && gtp->sk_created) + return gtp1u_send_echo_resp(gtp, skb); + if (gtp1->type != GTP_TPDU) return 1; @@ -398,20 +583,6 @@ static void gtp_dev_uninit(struct net_device *dev) free_percpu(dev->tstats); } -static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, - const struct sock *sk, - __be32 daddr) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = sk->sk_bound_dev_if; - fl4->daddr = daddr; - fl4->saddr = inet_sk(sk)->inet_saddr; - fl4->flowi4_tos = RT_CONN_FLAGS(sk); - fl4->flowi4_proto = sk->sk_protocol; - - return ip_route_output_key(sock_net(sk), fl4); -} - static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) { int payload_len = skb->len; @@ -517,7 +688,8 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, } netdev_dbg(dev, "found PDP context %p\n", pctx); - rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer_addr_ip4.s_addr); + rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer_addr_ip4.s_addr, + inet_sk(pctx->sk)->inet_saddr); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to SSGN %pI4\n", &pctx->peer_addr_ip4.s_addr); @@ -746,6 +918,11 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, } gtp->role = role; + if (!data[IFLA_GTP_RESTART_COUNT]) + gtp->restart_count = 0; + else + gtp->restart_count = nla_get_u8(data[IFLA_GTP_RESTART_COUNT]); + gtp->net = src_net; err = gtp_hashtable_new(gtp, hashsize); @@ -801,6 +978,7 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, [IFLA_GTP_ROLE] = { .type = NLA_U32 }, [IFLA_GTP_CREATE_SOCKETS] = { .type = NLA_U8 }, + [IFLA_GTP_RESTART_COUNT] = { .type = NLA_U8 }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], @@ -815,7 +993,8 @@ static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], static size_t gtp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GTP_PDP_HASHSIZE */ - nla_total_size(sizeof(__u32)); /* IFLA_GTP_ROLE */ + nla_total_size(sizeof(__u32)) + /* IFLA_GTP_ROLE */ + nla_total_size(sizeof(__u8)); /* IFLA_GTP_RESTART_COUNT */ } static int gtp_fill_info(struct sk_buff *skb, const struct net_device *dev) @@ -826,6 +1005,8 @@ static int gtp_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; if (nla_put_u32(skb, IFLA_GTP_ROLE, gtp->role)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GTP_RESTART_COUNT, gtp->restart_count)) + goto nla_put_failure; return 0; diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e16ebb2a82d..0e12c37f2958 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -7,8 +7,13 @@ #define GTP0_PORT 3386 #define GTP1U_PORT 2152 +/* GTP messages types */ +#define GTP_ECHO_REQ 1 /* Echo Request */ +#define GTP_ECHO_RSP 2 /* Echo Response */ #define GTP_TPDU 255 +#define GTPIE_RECOVERY 14 + struct gtp0_header { /* According to GSM TS 09.60. */ __u8 flags; __u8 type; @@ -27,6 +32,32 @@ struct gtp1_header { /* According to 3GPP TS 29.060. */ __be32 tid; } __attribute__ ((packed)); +struct gtp1_header_long { /* According to 3GPP TS 29.060. 
 */ + __u8 flags; + __u8 type; + __be16 length; + __be32 tid; + __be16 seq; + __u8 npdu; + __u8 next; +} __packed; + +/* GTP Information Element */ +struct gtp_ie { + __u8 tag; + __u8 val; +} __packed; + +struct gtp0_packet { + struct gtp0_header gtp0_h; + struct gtp_ie ie; +} __packed; + +struct gtp1u_packet { + struct gtp1_header_long gtp1u_h; + struct gtp_ie ie; +} __packed; + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ebd2aa3ef809..bd24c7dc10a2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -888,6 +888,7 @@ enum { IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, IFLA_GTP_CREATE_SOCKETS, + IFLA_GTP_RESTART_COUNT, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.2.3 From d33bd757d362699cfce3c68b53cd12b947d196f4 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:44 +0100 Subject: gtp: Implement GTP echo request Adding a GTP device through ip link creates a situation where the GTP instance is not able to send GTP echo requests. Echo requests are used to check whether a GTP peer is still alive. With this patch, gtp_genl_ops are extended by a new cmd (GTP_CMD_ECHOREQ) which allows sending an echo request in the given version of the GTP protocol (v0 or v1), from the given ms address to the given peer. TID is not included because in all path management messages it should be equal to 0. When a GTP echo response is detected, a multicast message is sent to everyone in the gtp_genl_family. The message contains the GTP version, ms address and peer address. Suggested-by: Harald Welte Signed-off-by: Wojciech Drewek Reviewed-by: Harald Welte Signed-off-by: Tony Nguyen --- drivers/net/gtp.c | 305 +++++++++++++++++++++++++++++++++++++++++------ include/uapi/linux/gtp.h | 1 + 2 files changed, 269 insertions(+), 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index bf434d79f868..756714d4ad92 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -79,6 +79,12 @@ struct gtp_dev { u8 restart_count; }; +struct echo_info { + struct in_addr ms_addr_ip4; + struct in_addr peer_addr_ip4; + u8 gtp_version; +}; + static unsigned int gtp_net_id __read_mostly; struct gtp_net { @@ -87,6 +93,16 @@ struct gtp_net { static u32 gtp_h_initval; +static struct genl_family gtp_genl_family; + +enum gtp_multicast_groups { + GTP_GENL_MCGRP, +}; + +static const struct genl_multicast_group gtp_genl_mcgrps[] = { + [GTP_GENL_MCGRP] = { .name = GTP_GENL_MCGRP_NAME }, +}; + static void pdp_context_delete(struct pdp_ctx *pctx); static inline u32 gtp0_hashfn(u64 tid) @@ -243,12 +259,38 @@ static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, * by the receiver * Returns true if the echo req was correct, false otherwise. */ -static bool gtp0_validate_echo_req(struct gtp0_header *gtp0) +static bool gtp0_validate_echo_hdr(struct gtp0_header *gtp0) { return !(gtp0->tid || (gtp0->flags ^ 0x1e) || gtp0->number != 0xff || gtp0->flow); } +/* msg_type has to be GTP_ECHO_REQ or GTP_ECHO_RSP */ +static void gtp0_build_echo_msg(struct gtp0_header *hdr, __u8 msg_type) +{ + int len_pkt, len_hdr; + + hdr->flags = 0x1e; /* v0, GTP-non-prime. */ + hdr->type = msg_type; + /* GSM TS 09.60. 7.3 In all Path Management Flow Label and TID + * are not used and shall be set to 0.
+ */ + hdr->flow = 0; + hdr->tid = 0; + hdr->number = 0xff; + hdr->spare[0] = 0xff; + hdr->spare[1] = 0xff; + hdr->spare[2] = 0xff; + + len_pkt = sizeof(struct gtp0_packet); + len_hdr = sizeof(struct gtp0_header); + + if (msg_type == GTP_ECHO_RSP) + hdr->length = htons(len_pkt - len_hdr); + else + hdr->length = 0; +} + static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp0_packet *gtp_pkt; @@ -260,7 +302,7 @@ static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); - if (!gtp0_validate_echo_req(gtp0)) + if (!gtp0_validate_echo_hdr(gtp0)) return -1; seq = gtp0->seq; @@ -271,10 +313,7 @@ static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) gtp_pkt = skb_push(skb, sizeof(struct gtp0_packet)); memset(gtp_pkt, 0, sizeof(struct gtp0_packet)); - gtp_pkt->gtp0_h.flags = 0x1e; /* v0, GTP-non-prime. */ - gtp_pkt->gtp0_h.type = GTP_ECHO_RSP; - gtp_pkt->gtp0_h.length = - htons(sizeof(struct gtp0_packet) - sizeof(struct gtp0_header)); + gtp0_build_echo_msg(>p_pkt->gtp0_h, GTP_ECHO_RSP); /* GSM TS 09.60. 7.3 The Sequence Number in a signalling response * message shall be copied from the signalling request message @@ -282,16 +321,6 @@ static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) */ gtp_pkt->gtp0_h.seq = seq; - /* GSM TS 09.60. 7.3 In all Path Management Flow Label and TID - * are not used and shall be set to 0. - */ - gtp_pkt->gtp0_h.flow = 0; - gtp_pkt->gtp0_h.tid = 0; - gtp_pkt->gtp0_h.number = 0xff; - gtp_pkt->gtp0_h.spare[0] = 0xff; - gtp_pkt->gtp0_h.spare[1] = 0xff; - gtp_pkt->gtp0_h.spare[2] = 0xff; - gtp_pkt->ie.tag = GTPIE_RECOVERY; gtp_pkt->ie.val = gtp->restart_count; @@ -319,6 +348,61 @@ static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) return 0; } +static int gtp_genl_fill_echo(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, + int flags, u32 type, struct echo_info echo) +{ + void *genlh; + + genlh = genlmsg_put(skb, snd_portid, snd_seq, >p_genl_family, flags, + type); + if (!genlh) + goto failure; + + if (nla_put_u32(skb, GTPA_VERSION, echo.gtp_version) || + nla_put_be32(skb, GTPA_PEER_ADDRESS, echo.peer_addr_ip4.s_addr) || + nla_put_be32(skb, GTPA_MS_ADDRESS, echo.ms_addr_ip4.s_addr)) + goto failure; + + genlmsg_end(skb, genlh); + return 0; + +failure: + genlmsg_cancel(skb, genlh); + return -EMSGSIZE; +} + +static int gtp0_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) +{ + struct gtp0_header *gtp0; + struct echo_info echo; + struct sk_buff *msg; + struct iphdr *iph; + int ret; + + gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); + + if (!gtp0_validate_echo_hdr(gtp0)) + return -1; + + iph = ip_hdr(skb); + echo.ms_addr_ip4.s_addr = iph->daddr; + echo.peer_addr_ip4.s_addr = iph->saddr; + echo.gtp_version = GTP_V0; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + ret = gtp_genl_fill_echo(msg, 0, 0, 0, GTP_CMD_ECHOREQ, echo); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + return genlmsg_multicast_netns(>p_genl_family, dev_net(gtp->dev), + msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); +} + /* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. 
*/ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { @@ -342,6 +426,9 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) if (gtp0->type == GTP_ECHO_REQ && gtp->sk_created) return gtp0_send_echo_resp(gtp, skb); + if (gtp0->type == GTP_ECHO_RSP && gtp->sk_created) + return gtp0_handle_echo_resp(gtp, skb); + if (gtp0->type != GTP_TPDU) return 1; @@ -354,6 +441,36 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) return gtp_rx(pctx, skb, hdrlen, gtp->role); } +/* msg_type has to be GTP_ECHO_REQ or GTP_ECHO_RSP */ +static void gtp1u_build_echo_msg(struct gtp1_header_long *hdr, __u8 msg_type) +{ + int len_pkt, len_hdr; + + /* S flag must be set to 1 */ + hdr->flags = 0x32; /* v1, GTP-non-prime. */ + hdr->type = msg_type; + /* 3GPP TS 29.281 5.1 - TEID has to be set to 0 */ + hdr->tid = 0; + + /* seq, npdu and next should be counted to the length of the GTP packet + * that's why szie of gtp1_header should be subtracted, + * not size of gtp1_header_long. + */ + + len_hdr = sizeof(struct gtp1_header); + + if (msg_type == GTP_ECHO_RSP) { + len_pkt = sizeof(struct gtp1u_packet); + hdr->length = htons(len_pkt - len_hdr); + } else { + /* GTP_ECHO_REQ does not carry GTP Information Element, + * the why gtp1_header_long is used here. + */ + len_pkt = sizeof(struct gtp1_header_long); + hdr->length = htons(len_pkt - len_hdr); + } +} + static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp1_header_long *gtp1u; @@ -378,17 +495,7 @@ static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) gtp_pkt = skb_push(skb, sizeof(struct gtp1u_packet)); memset(gtp_pkt, 0, sizeof(struct gtp1u_packet)); - /* S flag must be set to 1 */ - gtp_pkt->gtp1u_h.flags = 0x32; - gtp_pkt->gtp1u_h.type = GTP_ECHO_RSP; - /* seq, npdu and next should be counted to the length of the GTP packet - * that's why szie of gtp1_header should be subtracted, - * not why szie of gtp1_header_long. - */ - gtp_pkt->gtp1u_h.length = - htons(sizeof(struct gtp1u_packet) - sizeof(struct gtp1_header)); - /* 3GPP TS 29.281 5.1 - TEID has to be set to 0 */ - gtp_pkt->gtp1u_h.tid = 0; + gtp1u_build_echo_msg(>p_pkt->gtp1u_h, GTP_ECHO_RSP); /* 3GPP TS 29.281 7.7.2 - The Restart Counter value in the * Recovery information element shall not be used, i.e. it shall @@ -423,6 +530,42 @@ static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) return 0; } +static int gtp1u_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) +{ + struct gtp1_header_long *gtp1u; + struct echo_info echo; + struct sk_buff *msg; + struct iphdr *iph; + int ret; + + gtp1u = (struct gtp1_header_long *)(skb->data + sizeof(struct udphdr)); + + /* 3GPP TS 29.281 5.1 - For the Echo Request, Echo Response, + * Error Indication and Supported Extension Headers Notification + * messages, the S flag shall be set to 1 and TEID shall be set to 0. 
+ */ + if (!(gtp1u->flags & GTP1_F_SEQ) || gtp1u->tid) + return -1; + + iph = ip_hdr(skb); + echo.ms_addr_ip4.s_addr = iph->daddr; + echo.peer_addr_ip4.s_addr = iph->saddr; + echo.gtp_version = GTP_V1; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + ret = gtp_genl_fill_echo(msg, 0, 0, 0, GTP_CMD_ECHOREQ, echo); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + return genlmsg_multicast_netns(>p_genl_family, dev_net(gtp->dev), + msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); +} + static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { unsigned int hdrlen = sizeof(struct udphdr) + @@ -445,6 +588,9 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) if (gtp1->type == GTP_ECHO_REQ && gtp->sk_created) return gtp1u_send_echo_resp(gtp, skb); + if (gtp1->type == GTP_ECHO_RSP && gtp->sk_created) + return gtp1u_handle_echo_resp(gtp, skb); + if (gtp1->type != GTP_TPDU) return 1; @@ -1431,16 +1577,6 @@ out_unlock: return err; } -static struct genl_family gtp_genl_family; - -enum gtp_multicast_groups { - GTP_GENL_MCGRP, -}; - -static const struct genl_multicast_group gtp_genl_mcgrps[] = { - [GTP_GENL_MCGRP] = { .name = GTP_GENL_MCGRP_NAME }, -}; - static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct pdp_ctx *pctx) { @@ -1584,6 +1720,95 @@ out: return skb->len; } +static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *skb_to_send; + __be32 src_ip, dst_ip; + unsigned int version; + struct gtp_dev *gtp; + struct flowi4 fl4; + struct rtable *rt; + struct sock *sk; + __be16 port; + int len; + + if (!info->attrs[GTPA_VERSION] || + !info->attrs[GTPA_LINK] || + !info->attrs[GTPA_PEER_ADDRESS] || + !info->attrs[GTPA_MS_ADDRESS]) + return -EINVAL; + + version = nla_get_u32(info->attrs[GTPA_VERSION]); + dst_ip = nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]); + src_ip = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); + + gtp = gtp_find_dev(sock_net(skb->sk), info->attrs); + if (!gtp) + return -ENODEV; + + if (!gtp->sk_created) + return -EOPNOTSUPP; + if (!(gtp->dev->flags & IFF_UP)) + return -ENETDOWN; + + if (version == GTP_V0) { + struct gtp0_header *gtp0_h; + + len = LL_RESERVED_SPACE(gtp->dev) + sizeof(struct gtp0_header) + + sizeof(struct iphdr) + sizeof(struct udphdr); + + skb_to_send = netdev_alloc_skb_ip_align(gtp->dev, len); + if (!skb_to_send) + return -ENOMEM; + + sk = gtp->sk0; + port = htons(GTP0_PORT); + + gtp0_h = skb_push(skb_to_send, sizeof(struct gtp0_header)); + memset(gtp0_h, 0, sizeof(struct gtp0_header)); + gtp0_build_echo_msg(gtp0_h, GTP_ECHO_REQ); + } else if (version == GTP_V1) { + struct gtp1_header_long *gtp1u_h; + + len = LL_RESERVED_SPACE(gtp->dev) + + sizeof(struct gtp1_header_long) + + sizeof(struct iphdr) + sizeof(struct udphdr); + + skb_to_send = netdev_alloc_skb_ip_align(gtp->dev, len); + if (!skb_to_send) + return -ENOMEM; + + sk = gtp->sk1u; + port = htons(GTP1U_PORT); + + gtp1u_h = skb_push(skb_to_send, + sizeof(struct gtp1_header_long)); + memset(gtp1u_h, 0, sizeof(struct gtp1_header_long)); + gtp1u_build_echo_msg(gtp1u_h, GTP_ECHO_REQ); + } else { + return -ENODEV; + } + + rt = ip4_route_output_gtp(&fl4, sk, dst_ip, src_ip); + if (IS_ERR(rt)) { + netdev_dbg(gtp->dev, "no route for echo request to %pI4\n", + &dst_ip); + kfree_skb(skb_to_send); + return -ENODEV; + } + + udp_tunnel_xmit_skb(rt, sk, skb_to_send, + fl4.saddr, fl4.daddr, + fl4.flowi4_tos, + ip4_dst_hoplimit(&rt->dst), + 0, + port, port, + 
!net_eq(sock_net(sk), + dev_net(gtp->dev)), + false); + return 0; +} + static const struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = { [GTPA_LINK] = { .type = NLA_U32, }, [GTPA_VERSION] = { .type = NLA_U32, }, @@ -1616,6 +1841,12 @@ static const struct genl_small_ops gtp_genl_ops[] = { .dumpit = gtp_genl_dump_pdp, .flags = GENL_ADMIN_PERM, }, + { + .cmd = GTP_CMD_ECHOREQ, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = gtp_genl_send_echo_req, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family gtp_genl_family __ro_after_init = { diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 79f9191bbb24..2f61298a7b77 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -8,6 +8,7 @@ enum gtp_genl_cmds { GTP_CMD_NEWPDP, GTP_CMD_DELPDP, GTP_CMD_GETPDP, + GTP_CMD_ECHOREQ, GTP_CMD_MAX, }; -- cgit v1.2.3 From e3acda7ade0a36c5cbebc2b54d30b7f08a4ba29b Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:45 +0100 Subject: net/sched: Allow flower to match on GTP options Options are as follows: PDU_TYPE:QFI, and they refer to the fields from the PDU Session Protocol. PDU Session data is conveyed in the GTP-U Extension Header, which is described in 3GPP TS 29.281. The PDU Session Protocol is described in 3GPP TS 38.415. PDU_TYPE - indicates the type of the PDU Session Information (4 bits) QFI - QoS Flow Identifier (6 bits) # ip link add gtp_dev type gtp role sgsn # tc qdisc add dev gtp_dev ingress # tc filter add dev gtp_dev protocol ip parent ffff: \ flower \ enc_key_id 11 \ gtp_opts 1:8/ff:ff \ action mirred egress redirect dev eth0 Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen --- include/net/gtp.h | 5 ++ include/uapi/linux/if_tunnel.h | 4 +- include/uapi/linux/pkt_cls.h | 15 ++++++ net/sched/cls_flower.c | 116 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e12c37f2958..23c2aaae8a42 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -58,6 +58,11 @@ struct gtp1u_packet { struct gtp_ie ie; } __packed; +struct gtp_pdu_session_info { /* According to 3GPP TS 38.415.
*/ + u8 pdu_type; + u8 qfi; +}; + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7d9105533c7b..102119628ff5 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,8 +176,10 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) +#define TUNNEL_GTP_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ - (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \ + TUNNEL_GTP_OPT) #endif /* _UAPI_IF_TUNNEL_H_ */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee38b35c3f57..404f97fb239c 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -616,6 +616,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_GTP, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GTP_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -654,6 +658,17 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_GTP_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GTP_QFI, /* u8 */ + + __TCA_FLOWER_KEY_ENC_OPT_GTP_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GTP_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX - 1) + enum { TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC, TCA_FLOWER_KEY_MPLS_OPTS_LSE, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a9b1f140f9e..c80fc49c0da1 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -723,6 +724,7 @@ enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_GTP] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -746,6 +748,12 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static const struct nla_policy +gtp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_GTP_QFI] = { .type = NLA_U8 }, +}; + static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, @@ -1262,6 +1270,49 @@ static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(*md); } +static int fl_set_gtp_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1]; + struct gtp_pdu_session_info *sinfo; + u8 len = key->enc_opts.len; + int err; + + sinfo = (struct gtp_pdu_session_info *)&key->enc_opts.data[len]; + memset(sinfo, 0xff, option_len); + + if (!depth) + return sizeof(*sinfo); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GTP) { + NL_SET_ERR_MSG_MOD(extack, "Non-gtp option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GTP_MAX, nla, + gtp_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && + (!tb[TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE] || + 
!tb[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI])) { + NL_SET_ERR_MSG_MOD(extack, + "Missing tunnel key gtp option pdu type or qfi"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE]) + sinfo->pdu_type = + nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE]); + + if (tb[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI]) + sinfo->qfi = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI]); + + return sizeof(*sinfo); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1386,6 +1437,38 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } break; + case TCA_FLOWER_KEY_ENC_OPTS_GTP: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG_MOD(extack, + "Duplicate type for gtp options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + option_len = fl_set_gtp_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + option_len = fl_set_gtp_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG_MOD(extack, + "Key and mask miss aligned"); + return -EINVAL; + } + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -2761,6 +2844,34 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_gtp_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) + +{ + struct gtp_pdu_session_info *session_info; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_GTP); + if (!nest) + goto nla_put_failure; + + session_info = (struct gtp_pdu_session_info *)&enc_opts->data[0]; + + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE, + session_info->pdu_type)) + goto nla_put_failure; + + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GTP_QFI, session_info->qfi)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -2824,6 +2935,11 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, if (err) goto nla_put_failure; break; + case TUNNEL_GTP_OPT: + err = fl_dump_key_gtp_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } -- cgit v1.2.3 From 3b6c6c039707f6bb7c64af2aa82a437fabb93aee Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2022 19:49:48 -0800 Subject: nvdimm/region: Delete nd_blk_region infrastructure Now that the nd_namespace_blk infrastructure is removed, delete all the region machinery to coordinate provisioning aliased capacity between PMEM and BLK. 
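As a quick sanity check (a sketch, assuming the ndctl utility is available on the converted system), region enumeration should now report only pmem-typed regions and no blk regions:

# ndctl list --regions | grep '"type"'
	"type":"pmem",

Out-of-tree consumers of the deleted nd_blk_region APIs will no longer build against these headers.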
Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/164688418803.2879318.1302315202397235855.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 11 +- drivers/nvdimm/bus.c | 2 - drivers/nvdimm/dimm_devs.c | 204 +++---------------------------------- drivers/nvdimm/label.c | 6 +- drivers/nvdimm/label.h | 2 +- drivers/nvdimm/namespace_devs.c | 127 +++-------------------- drivers/nvdimm/nd-core.h | 24 +---- drivers/nvdimm/nd.h | 12 --- drivers/nvdimm/region.c | 31 ++---- drivers/nvdimm/region_devs.c | 158 ++++------------------------ include/linux/libnvdimm.h | 24 ----- include/uapi/linux/ndctl.h | 2 - tools/testing/nvdimm/test/ndtest.c | 67 +----------- 13 files changed, 66 insertions(+), 604 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index bea6a219fddd..fe61f617a943 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2036,10 +2036,6 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK; } - /* Quirk to ignore LOCAL for labels on HYPERV DIMMs */ - if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) - set_bit(NDD_NOBLK, &flags); - if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) { set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); @@ -2602,8 +2598,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, { static struct nd_mapping_desc mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; - struct nd_blk_region_desc ndbr_desc; - struct nd_region_desc *ndr_desc; + struct nd_region_desc *ndr_desc, _ndr_desc; struct nfit_memdev *nfit_memdev; struct nvdimm_bus *nvdimm_bus; struct resource res; @@ -2619,10 +2614,10 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, memset(&res, 0, sizeof(res)); memset(&mappings, 0, sizeof(mappings)); - memset(&ndbr_desc, 0, sizeof(ndbr_desc)); + memset(&_ndr_desc, 0, sizeof(_ndr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; - ndr_desc = &ndbr_desc.ndr_desc; + ndr_desc = &_ndr_desc; ndr_desc->res = &res; ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 9dc7f3edd42b..a4b5f637e599 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -35,8 +35,6 @@ static int to_nd_device_type(struct device *dev) return ND_DEVICE_DIMM; else if (is_memory(dev)) return ND_DEVICE_REGION_PMEM; - else if (is_nd_blk(dev)) - return ND_DEVICE_REGION_BLK; else if (is_nd_dax(dev)) return ND_DEVICE_DAX_PMEM; else if (is_nd_region(dev->parent)) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index dc7449a40003..ee507eed42b5 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -18,10 +18,6 @@ static DEFINE_IDA(dimm_ida); -static bool noblk; -module_param(noblk, bool, 0444); -MODULE_PARM_DESC(noblk, "force disable BLK / local alias support"); - /* * Retrieve bus and dimm handle and return if this bus supports * get_config_data commands @@ -211,22 +207,6 @@ struct nvdimm *to_nvdimm(struct device *dev) } EXPORT_SYMBOL_GPL(to_nvdimm); -struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr) -{ - struct nd_region *nd_region = &ndbr->nd_region; - struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - - return nd_mapping->nvdimm; -} -EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm); - -unsigned long 
nd_blk_memremap_flags(struct nd_blk_region *ndbr) -{ - /* pmem mapping properties are private to libnvdimm */ - return ARCH_MEMREMAP_PMEM; -} -EXPORT_SYMBOL_GPL(nd_blk_memremap_flags); - struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) { struct nvdimm *nvdimm = nd_mapping->nvdimm; @@ -312,8 +292,7 @@ static ssize_t flags_show(struct device *dev, { struct nvdimm *nvdimm = to_nvdimm(dev); - return sprintf(buf, "%s%s%s\n", - test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "", + return sprintf(buf, "%s%s\n", test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "", test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : ""); } @@ -612,8 +591,6 @@ struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus, nvdimm->dimm_id = dimm_id; nvdimm->provider_data = provider_data; - if (noblk) - flags |= 1 << NDD_NOBLK; nvdimm->flags = flags; nvdimm->cmd_mask = cmd_mask; nvdimm->num_flush = num_flush; @@ -726,133 +703,6 @@ static unsigned long dpa_align(struct nd_region *nd_region) return nd_region->align / nd_region->ndr_mappings; } -int alias_dpa_busy(struct device *dev, void *data) -{ - resource_size_t map_end, blk_start, new; - struct blk_alloc_info *info = data; - struct nd_mapping *nd_mapping; - struct nd_region *nd_region; - struct nvdimm_drvdata *ndd; - struct resource *res; - unsigned long align; - int i; - - if (!is_memory(dev)) - return 0; - - nd_region = to_nd_region(dev); - for (i = 0; i < nd_region->ndr_mappings; i++) { - nd_mapping = &nd_region->mapping[i]; - if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) - break; - } - - if (i >= nd_region->ndr_mappings) - return 0; - - ndd = to_ndd(nd_mapping); - map_end = nd_mapping->start + nd_mapping->size - 1; - blk_start = nd_mapping->start; - - /* - * In the allocation case ->res is set to free space that we are - * looking to validate against PMEM aliasing collision rules - * (i.e. BLK is allocated after all aliased PMEM). - */ - if (info->res) { - if (info->res->start >= nd_mapping->start - && info->res->start < map_end) - /* pass */; - else - return 0; - } - - retry: - /* - * Find the free dpa from the end of the last pmem allocation to - * the end of the interleave-set mapping. - */ - align = dpa_align(nd_region); - if (!align) - return 0; - - for_each_dpa_resource(ndd, res) { - resource_size_t start, end; - - if (strncmp(res->name, "pmem", 4) != 0) - continue; - - start = ALIGN_DOWN(res->start, align); - end = ALIGN(res->end + 1, align) - 1; - if ((start >= blk_start && start < map_end) - || (end >= blk_start && end <= map_end)) { - new = max(blk_start, min(map_end, end) + 1); - if (new != blk_start) { - blk_start = new; - goto retry; - } - } - } - - /* update the free space range with the probed blk_start */ - if (info->res && blk_start > info->res->start) { - info->res->start = max(info->res->start, blk_start); - if (info->res->start > info->res->end) - info->res->end = info->res->start - 1; - return 1; - } - - info->available -= blk_start - nd_mapping->start; - - return 0; -} - -/** - * nd_blk_available_dpa - account the unused dpa of BLK region - * @nd_mapping: container of dpa-resource-root + labels - * - * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges, but - * we arrange for them to never start at an lower dpa than the last - * PMEM allocation in an aliased region. 
- */ -resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) -{ - struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct blk_alloc_info info = { - .nd_mapping = nd_mapping, - .available = nd_mapping->size, - .res = NULL, - }; - struct resource *res; - unsigned long align; - - if (!ndd) - return 0; - - device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); - - /* now account for busy blk allocations in unaliased dpa */ - align = dpa_align(nd_region); - if (!align) - return 0; - for_each_dpa_resource(ndd, res) { - resource_size_t start, end, size; - - if (strncmp(res->name, "blk", 3) != 0) - continue; - start = ALIGN_DOWN(res->start, align); - end = ALIGN(res->end + 1, align) - 1; - size = end - start + 1; - if (size >= info.available) - return 0; - info.available -= size; - } - - return info.available; -} - /** * nd_pmem_max_contiguous_dpa - For the given dimm+region, return the max * contiguous unallocated dpa range. @@ -900,24 +750,16 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa * @nd_mapping: container of dpa-resource-root + labels * @nd_region: constrain available space check to this reference region - * @overlap: calculate available space assuming this level of overlap * * Validate that a PMEM label, if present, aligns with the start of an - * interleave set and truncate the available size at the lowest BLK - * overlap point. - * - * The expectation is that this routine is called multiple times as it - * probes for the largest BLK encroachment for any single member DIMM of - * the interleave set. Once that value is determined the PMEM-limit for - * the set can be established. + * interleave set. */ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, - struct nd_mapping *nd_mapping, resource_size_t *overlap) + struct nd_mapping *nd_mapping) { - resource_size_t map_start, map_end, busy = 0, available, blk_start; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_start, map_end, busy = 0; struct resource *res; - const char *reason; unsigned long align; if (!ndd) @@ -929,46 +771,28 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, map_start = nd_mapping->start; map_end = map_start + nd_mapping->size - 1; - blk_start = max(map_start, map_end + 1 - *overlap); for_each_dpa_resource(ndd, res) { resource_size_t start, end; start = ALIGN_DOWN(res->start, align); end = ALIGN(res->end + 1, align) - 1; if (start >= map_start && start < map_end) { - if (strncmp(res->name, "blk", 3) == 0) - blk_start = min(blk_start, - max(map_start, start)); - else if (end > map_end) { - reason = "misaligned to iset"; - goto err; - } else - busy += end - start + 1; + if (end > map_end) { + nd_dbg_dpa(nd_region, ndd, res, + "misaligned to iset\n"); + return 0; + } + busy += end - start + 1; } else if (end >= map_start && end <= map_end) { - if (strncmp(res->name, "blk", 3) == 0) { - /* - * If a BLK allocation overlaps the start of - * PMEM the entire interleave set may now only - * be used for BLK. 
- */ - blk_start = map_start; - } else - busy += end - start + 1; + busy += end - start + 1; } else if (map_start > start && map_start < end) { /* total eclipse of the mapping */ busy += nd_mapping->size; - blk_start = map_start; } } - *overlap = map_end + 1 - blk_start; - available = blk_start - map_start; - if (busy < available) - return ALIGN_DOWN(available - busy, align); - return 0; - - err: - nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); + if (busy < nd_mapping->size) + return ALIGN_DOWN(nd_mapping->size - busy, align); return 0; } @@ -999,7 +823,7 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, /** * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id * @nvdimm: container of dpa-resource-root + labels - * @label_id: dpa resource name of the form {pmem|blk}- + * @label_id: dpa resource name of the form pmem- */ resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 8c972bcb2ac3..082253a3a956 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -334,8 +334,7 @@ char *nd_label_gen_id(struct nd_label_id *label_id, const uuid_t *uuid, { if (!label_id || !uuid) return NULL; - snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb", - flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid); + snprintf(label_id->id, ND_LABEL_ID_SIZE, "pmem-%pUb", uuid); return label_id->id; } @@ -406,7 +405,6 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) return 0; /* no label, nothing to reserve */ for_each_clear_bit_le(slot, free, nslot) { - struct nvdimm *nvdimm = to_nvdimm(ndd->dev); struct nd_namespace_label *nd_label; struct nd_region *nd_region = NULL; struct nd_label_id label_id; @@ -421,8 +419,6 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) nsl_get_uuid(ndd, nd_label, &label_uuid); flags = nsl_get_flags(ndd, nd_label); - if (test_bit(NDD_NOBLK, &nvdimm->flags)) - flags &= ~NSLABEL_FLAG_LOCAL; nd_label_gen_id(&label_id, &label_uuid, flags); res = nvdimm_allocate_dpa(ndd, &label_id, nsl_get_dpa(ndd, nd_label), diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index 198ef1df298b..0650fb4b9821 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -193,7 +193,7 @@ struct nd_namespace_label { /** * struct nd_label_id - identifier string for dpa allocation - * @id: "{blk|pmem}-" + * @id: "pmem-" */ struct nd_label_id { char id[ND_LABEL_ID_SIZE]; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index d1c190b02657..62b83b2e26e3 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -297,13 +297,11 @@ static int scan_free(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_label_id *label_id, resource_size_t n) { - bool is_blk = strncmp(label_id->id, "blk", 3) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); int rc = 0; while (n) { struct resource *res, *last; - resource_size_t new_start; last = NULL; for_each_dpa_resource(ndd, res) @@ -321,16 +319,7 @@ static int scan_free(struct nd_region *nd_region, continue; } - /* - * Keep BLK allocations relegated to high DPA as much as - * possible - */ - if (is_blk) - new_start = res->start + n; - else - new_start = res->start; - - rc = adjust_resource(res, new_start, resource_size(res) - n); + rc = adjust_resource(res, res->start, resource_size(res) - n); if (rc == 0) res->flags |= DPA_RESOURCE_ADJUSTED; nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); @@ -372,20 +361,12 @@ static 
resource_size_t init_dpa_allocation(struct nd_label_id *label_id, struct nd_region *nd_region, struct nd_mapping *nd_mapping, resource_size_t n) { - bool is_blk = strncmp(label_id->id, "blk", 3) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - resource_size_t first_dpa; struct resource *res; int rc = 0; - /* allocate blk from highest dpa first */ - if (is_blk) - first_dpa = nd_mapping->start + nd_mapping->size - n; - else - first_dpa = nd_mapping->start; - /* first resource allocation for this label-id or dimm */ - res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n); + res = nvdimm_allocate_dpa(ndd, label_id, nd_mapping->start, n); if (!res) rc = -EBUSY; @@ -416,7 +397,6 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, resource_size_t n, struct resource *valid) { bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; - bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; unsigned long align; align = nd_region->align / nd_region->ndr_mappings; @@ -429,21 +409,6 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, if (is_reserve) return; - if (!is_pmem) { - struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - struct nvdimm_bus *nvdimm_bus; - struct blk_alloc_info info = { - .nd_mapping = nd_mapping, - .available = nd_mapping->size, - .res = valid, - }; - - WARN_ON(!is_nd_blk(&nd_region->dev)); - nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); - return; - } - /* allocation needs to be contiguous, so this is all or nothing */ if (resource_size(valid) < n) goto invalid; @@ -471,7 +436,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, resource_size_t n) { resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; - bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct resource *res, *exist = NULL, valid; const resource_size_t to_allocate = n; @@ -569,10 +533,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, } if (strcmp(action, "allocate") == 0) { - /* BLK allocate bottom up */ - if (!is_pmem) - valid.start += available - allocate; - new_res = nvdimm_allocate_dpa(ndd, label_id, valid.start, allocate); if (!new_res) @@ -608,12 +568,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, return 0; } - /* - * If we allocated nothing in the BLK case it may be because we are in - * an initial "pmem-reserve pass". Only do an initial BLK allocation - * when none of the DPA space is reserved. 
- */ - if ((is_pmem || !ndd->dpa.child) && n == to_allocate) + if (n == to_allocate) return init_dpa_allocation(label_id, nd_region, nd_mapping, n); return n; } @@ -672,7 +627,7 @@ int __reserve_free_pmem(struct device *dev, void *data) if (nd_mapping->nvdimm != nvdimm) continue; - n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem); + n = nd_pmem_available_dpa(nd_region, nd_mapping); if (n == 0) return 0; rem = scan_allocate(nd_region, nd_mapping, &label_id, n); @@ -697,19 +652,6 @@ void release_free_pmem(struct nvdimm_bus *nvdimm_bus, nvdimm_free_dpa(ndd, res); } -static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus, - struct nd_mapping *nd_mapping) -{ - struct nvdimm *nvdimm = nd_mapping->nvdimm; - int rc; - - rc = device_for_each_child(&nvdimm_bus->dev, nvdimm, - __reserve_free_pmem); - if (rc) - release_free_pmem(nvdimm_bus, nd_mapping); - return rc; -} - /** * grow_dpa_allocation - for each dimm allocate n bytes for @label_id * @nd_region: the set of dimms to allocate @n more bytes from @@ -726,37 +668,14 @@ static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus, static int grow_dpa_allocation(struct nd_region *nd_region, struct nd_label_id *label_id, resource_size_t n) { - struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); - bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; int i; for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; resource_size_t rem = n; - int rc, j; - - /* - * In the BLK case try once with all unallocated PMEM - * reserved, and once without - */ - for (j = is_pmem; j < 2; j++) { - bool blk_only = j == 0; - - if (blk_only) { - rc = reserve_free_pmem(nvdimm_bus, nd_mapping); - if (rc) - return rc; - } - rem = scan_allocate(nd_region, nd_mapping, - label_id, rem); - if (blk_only) - release_free_pmem(nvdimm_bus, nd_mapping); - - /* try again and allow encroachments into PMEM */ - if (rem == 0) - break; - } + int rc; + rem = scan_allocate(nd_region, nd_mapping, label_id, rem); dev_WARN_ONCE(&nd_region->dev, rem, "allocation underrun: %#llx of %#llx bytes\n", (unsigned long long) n - rem, @@ -869,8 +788,8 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) ndd = to_ndd(nd_mapping); /* - * All dimms in an interleave set, or the base dimm for a blk - * region, need to be enabled for the size to be changed. + * All dimms in an interleave set, need to be enabled + * for the size to be changed. */ if (!ndd) return -ENXIO; @@ -1169,9 +1088,6 @@ static ssize_t resource_show(struct device *dev, } static DEVICE_ATTR_ADMIN_RO(resource); -static const unsigned long blk_lbasize_supported[] = { 512, 520, 528, - 4096, 4104, 4160, 4224, 0 }; - static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 }; static ssize_t sector_size_show(struct device *dev, @@ -1823,10 +1739,7 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region, /* * Fix up each mapping's 'labels' to have the validated pmem label for * that position at labels[0], and NULL at labels[1]. In the process, - * check that the namespace aligns with interleave-set. We know - * that it does not overlap with any blk namespaces by virtue of - * the dimm being enabled (i.e. nd_label_reserve_dpa() - * succeeded). + * check that the namespace aligns with interleave-set. 
*/ nsl_get_uuid(ndd, nd_label, &uuid); rc = select_pmem_id(nd_region, &uuid); @@ -1931,8 +1844,7 @@ void nd_region_create_ns_seed(struct nd_region *nd_region) * disabled until memory becomes available */ if (!nd_region->ns_seed) - dev_err(&nd_region->dev, "failed to create %s namespace\n", - is_nd_blk(&nd_region->dev) ? "blk" : "pmem"); + dev_err(&nd_region->dev, "failed to create namespace\n"); else nd_device_register(nd_region->ns_seed); } @@ -2028,16 +1940,9 @@ static struct device **scan_labels(struct nd_region *nd_region) list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { struct nd_namespace_label *nd_label = label_ent->label; struct device **__devs; - u32 flags; if (!nd_label) continue; - flags = nsl_get_flags(ndd, nd_label); - if (is_nd_blk(&nd_region->dev) - == !!(flags & NSLABEL_FLAG_LOCAL)) - /* pass, region matches label type */; - else - continue; /* skip labels that describe extents outside of the region */ if (nsl_get_dpa(ndd, nd_label) < nd_mapping->start || @@ -2073,9 +1978,8 @@ static struct device **scan_labels(struct nd_region *nd_region) } - dev_dbg(&nd_region->dev, "discovered %d %s namespace%s\n", - count, is_nd_blk(&nd_region->dev) - ? "blk" : "pmem", count == 1 ? "" : "s"); + dev_dbg(&nd_region->dev, "discovered %d namespace%s\n", count, + count == 1 ? "" : "s"); if (count == 0) { struct nd_namespace_pmem *nspm; @@ -2226,12 +2130,6 @@ static int init_active_labels(struct nd_region *nd_region) if (!label_ent) break; label = nd_label_active(ndd, j); - if (test_bit(NDD_NOBLK, &nvdimm->flags)) { - u32 flags = nsl_get_flags(ndd, label); - - flags &= ~NSLABEL_FLAG_LOCAL; - nsl_set_flags(ndd, label, flags); - } label_ent->label = label; mutex_lock(&nd_mapping->lock); @@ -2275,7 +2173,6 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) devs = create_namespace_io(nd_region); break; case ND_DEVICE_NAMESPACE_PMEM: - case ND_DEVICE_NAMESPACE_BLK: devs = create_namespaces(nd_region); break; default: diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index e4af0719cf33..17febf9ac911 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -82,30 +82,12 @@ static inline void nvdimm_security_overwrite_query(struct work_struct *work) } #endif -/** - * struct blk_alloc_info - tracking info for BLK dpa scanning - * @nd_mapping: blk region mapping boundaries - * @available: decremented in alias_dpa_busy as aliased PMEM is scanned - * @busy: decremented in blk_dpa_busy to account for ranges already - * handled by alias_dpa_busy - * @res: alias_dpa_busy interprets this a free space range that needs to - * be truncated to the valid BLK allocation starting DPA, blk_dpa_busy - * treats it as a busy range that needs the aliased PMEM ranges - * truncated. 
- */ -struct blk_alloc_info { - struct nd_mapping *nd_mapping; - resource_size_t available, busy; - struct resource *res; -}; - bool is_nvdimm(struct device *dev); bool is_nd_pmem(struct device *dev); bool is_nd_volatile(struct device *dev); -bool is_nd_blk(struct device *dev); static inline bool is_nd_region(struct device *dev) { - return is_nd_pmem(dev) || is_nd_blk(dev) || is_nd_volatile(dev); + return is_nd_pmem(dev) || is_nd_volatile(dev); } static inline bool is_memory(struct device *dev) { @@ -142,14 +124,12 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping); resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region); resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, - struct nd_mapping *nd_mapping, resource_size_t *overlap); -resource_size_t nd_blk_available_dpa(struct nd_region *nd_region); + struct nd_mapping *nd_mapping); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, resource_size_t size); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); -int alias_dpa_busy(struct device *dev, void *data); int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); void get_ndd(struct nvdimm_drvdata *ndd); resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 8391bf2729bc..ec5219680092 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -295,9 +295,6 @@ static inline const u8 *nsl_uuid_raw(struct nvdimm_drvdata *ndd, return nd_label->efi.uuid; } -bool nsl_validate_blk_isetcookie(struct nvdimm_drvdata *ndd, - struct nd_namespace_label *nd_label, - u64 isetcookie); bool nsl_validate_type_guid(struct nvdimm_drvdata *ndd, struct nd_namespace_label *nd_label, guid_t *guid); enum nvdimm_claim_class nsl_get_claim_class(struct nvdimm_drvdata *ndd, @@ -437,14 +434,6 @@ static inline bool nsl_validate_nlabel(struct nd_region *nd_region, return nsl_get_nlabel(ndd, nd_label) == nd_region->ndr_mappings; } -struct nd_blk_region { - int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); - int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, - void *iobuf, u64 len, int rw); - void *blk_provider_data; - struct nd_region nd_region; -}; - /* * Lookup next in the repeating sequence of 01, 10, and 11. 
*/ @@ -672,7 +661,6 @@ static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, return -ENXIO; } #endif -int nd_blk_region_init(struct nd_region *nd_region); int nd_region_activate(struct nd_region *nd_region); static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index e0c34120df37..188560b1c110 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -15,6 +15,10 @@ static int nd_region_probe(struct device *dev) static unsigned long once; struct nd_region_data *ndrd; struct nd_region *nd_region = to_nd_region(dev); + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + nd_region->ndr_size - 1, + }; if (nd_region->num_lanes > num_online_cpus() && nd_region->num_lanes < num_possible_cpus() @@ -30,25 +34,13 @@ static int nd_region_probe(struct device *dev) if (rc) return rc; - rc = nd_blk_region_init(nd_region); - if (rc) - return rc; - - if (is_memory(&nd_region->dev)) { - struct range range = { - .start = nd_region->ndr_start, - .end = nd_region->ndr_start + nd_region->ndr_size - 1, - }; - - if (devm_init_badblocks(dev, &nd_region->bb)) - return -ENODEV; - nd_region->bb_state = sysfs_get_dirent(nd_region->dev.kobj.sd, - "badblocks"); - if (!nd_region->bb_state) - dev_warn(&nd_region->dev, - "'badblocks' notification disabled\n"); - nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); - } + if (devm_init_badblocks(dev, &nd_region->bb)) + return -ENODEV; + nd_region->bb_state = + sysfs_get_dirent(nd_region->dev.kobj.sd, "badblocks"); + if (!nd_region->bb_state) + dev_warn(dev, "'badblocks' notification disabled\n"); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); rc = nd_region_register_namespaces(nd_region, &err); if (rc < 0) @@ -158,4 +150,3 @@ void nd_region_exit(void) } MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM); -MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 70ad891a76ba..0cb274c2b508 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -134,10 +134,7 @@ static void nd_region_release(struct device *dev) } free_percpu(nd_region->lane); memregion_free(nd_region->id); - if (is_nd_blk(dev)) - kfree(to_nd_blk_region(dev)); - else - kfree(nd_region); + kfree(nd_region); } struct nd_region *to_nd_region(struct device *dev) @@ -157,33 +154,12 @@ struct device *nd_region_dev(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(nd_region_dev); -struct nd_blk_region *to_nd_blk_region(struct device *dev) -{ - struct nd_region *nd_region = to_nd_region(dev); - - WARN_ON(!is_nd_blk(dev)); - return container_of(nd_region, struct nd_blk_region, nd_region); -} -EXPORT_SYMBOL_GPL(to_nd_blk_region); - void *nd_region_provider_data(struct nd_region *nd_region) { return nd_region->provider_data; } EXPORT_SYMBOL_GPL(nd_region_provider_data); -void *nd_blk_region_provider_data(struct nd_blk_region *ndbr) -{ - return ndbr->blk_provider_data; -} -EXPORT_SYMBOL_GPL(nd_blk_region_provider_data); - -void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data) -{ - ndbr->blk_provider_data = data; -} -EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); - /** * nd_region_to_nstype() - region to an integer namespace type * @nd_region: region-device to interrogate @@ -208,8 +184,6 @@ int nd_region_to_nstype(struct nd_region *nd_region) return ND_DEVICE_NAMESPACE_PMEM; else return ND_DEVICE_NAMESPACE_IO; - } else if 
(is_nd_blk(&nd_region->dev)) { - return ND_DEVICE_NAMESPACE_BLK; } return 0; @@ -332,14 +306,12 @@ static DEVICE_ATTR_RO(set_cookie); resource_size_t nd_region_available_dpa(struct nd_region *nd_region) { - resource_size_t blk_max_overlap = 0, available, overlap; + resource_size_t available; int i; WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); - retry: available = 0; - overlap = blk_max_overlap; for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); @@ -348,15 +320,7 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) if (!ndd) return 0; - if (is_memory(&nd_region->dev)) { - available += nd_pmem_available_dpa(nd_region, - nd_mapping, &overlap); - if (overlap > blk_max_overlap) { - blk_max_overlap = overlap; - goto retry; - } - } else if (is_nd_blk(&nd_region->dev)) - available += nd_blk_available_dpa(nd_region); + available += nd_pmem_available_dpa(nd_region, nd_mapping); } return available; @@ -364,26 +328,17 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region) { - resource_size_t available = 0; + resource_size_t avail = 0; int i; - if (is_memory(&nd_region->dev)) - available = PHYS_ADDR_MAX; - WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - if (is_memory(&nd_region->dev)) - available = min(available, - nd_pmem_max_contiguous_dpa(nd_region, - nd_mapping)); - else if (is_nd_blk(&nd_region->dev)) - available += nd_blk_available_dpa(nd_region); + avail = min_not_zero(avail, nd_pmem_max_contiguous_dpa( + nd_region, nd_mapping)); } - if (is_memory(&nd_region->dev)) - return available * nd_region->ndr_mappings; - return available; + return avail * nd_region->ndr_mappings; } static ssize_t available_size_show(struct device *dev, @@ -693,9 +648,8 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) && a != &dev_attr_available_size.attr) return a->mode; - if ((type == ND_DEVICE_NAMESPACE_PMEM - || type == ND_DEVICE_NAMESPACE_BLK) - && a == &dev_attr_available_size.attr) + if (type == ND_DEVICE_NAMESPACE_PMEM && + a == &dev_attr_available_size.attr) return a->mode; else if (is_memory(dev) && nd_set) return a->mode; @@ -828,12 +782,6 @@ static const struct attribute_group *nd_region_attribute_groups[] = { NULL, }; -static const struct device_type nd_blk_device_type = { - .name = "nd_blk", - .release = nd_region_release, - .groups = nd_region_attribute_groups, -}; - static const struct device_type nd_pmem_device_type = { .name = "nd_pmem", .release = nd_region_release, @@ -851,11 +799,6 @@ bool is_nd_pmem(struct device *dev) return dev ? dev->type == &nd_pmem_device_type : false; } -bool is_nd_blk(struct device *dev) -{ - return dev ? dev->type == &nd_blk_device_type : false; -} - bool is_nd_volatile(struct device *dev) { return dev ? 
dev->type == &nd_volatile_device_type : false; @@ -929,22 +872,6 @@ void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev) nvdimm_bus_unlock(dev); } -int nd_blk_region_init(struct nd_region *nd_region) -{ - struct device *dev = &nd_region->dev; - struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); - - if (!is_nd_blk(dev)) - return 0; - - if (nd_region->ndr_mappings < 1) { - dev_dbg(dev, "invalid BLK region\n"); - return -ENXIO; - } - - return to_nd_blk_region(dev)->enable(nvdimm_bus, dev); -} - /** * nd_region_acquire_lane - allocate and lock a lane * @nd_region: region id and number of lanes possible @@ -1007,24 +934,10 @@ EXPORT_SYMBOL(nd_region_release_lane); static unsigned long default_align(struct nd_region *nd_region) { unsigned long align; - int i, mappings; u32 remainder; + int mappings; - if (is_nd_blk(&nd_region->dev)) - align = PAGE_SIZE; - else - align = MEMREMAP_COMPAT_ALIGN_MAX; - - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; - - if (test_bit(NDD_ALIASING, &nvdimm->flags)) { - align = MEMREMAP_COMPAT_ALIGN_MAX; - break; - } - } - + align = MEMREMAP_COMPAT_ALIGN_MAX; if (nd_region->ndr_size < MEMREMAP_COMPAT_ALIGN_MAX) align = PAGE_SIZE; @@ -1042,7 +955,6 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, { struct nd_region *nd_region; struct device *dev; - void *region_buf; unsigned int i; int ro = 0; @@ -1060,36 +972,13 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (test_bit(NDD_UNARMED, &nvdimm->flags)) ro = 1; - if (test_bit(NDD_NOBLK, &nvdimm->flags) - && dev_type == &nd_blk_device_type) { - dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not BLK capable\n", - caller, dev_name(&nvdimm->dev), i); - return NULL; - } } - if (dev_type == &nd_blk_device_type) { - struct nd_blk_region_desc *ndbr_desc; - struct nd_blk_region *ndbr; - - ndbr_desc = to_blk_region_desc(ndr_desc); - ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping) - * ndr_desc->num_mappings, - GFP_KERNEL); - if (ndbr) { - nd_region = &ndbr->nd_region; - ndbr->enable = ndbr_desc->enable; - ndbr->do_io = ndbr_desc->do_io; - } - region_buf = ndbr; - } else { - nd_region = kzalloc(struct_size(nd_region, mapping, - ndr_desc->num_mappings), - GFP_KERNEL); - region_buf = nd_region; - } + nd_region = + kzalloc(struct_size(nd_region, mapping, ndr_desc->num_mappings), + GFP_KERNEL); - if (!region_buf) + if (!nd_region) return NULL; nd_region->id = memregion_alloc(GFP_KERNEL); if (nd_region->id < 0) @@ -1153,7 +1042,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, err_percpu: memregion_free(nd_region->id); err_id: - kfree(region_buf); + kfree(nd_region); return NULL; } @@ -1166,17 +1055,6 @@ struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, } EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create); -struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, - struct nd_region_desc *ndr_desc) -{ - if (ndr_desc->num_mappings > 1) - return NULL; - ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES); - return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type, - __func__); -} -EXPORT_SYMBOL_GPL(nvdimm_blk_region_create); - struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc) { @@ -1201,7 +1079,7 @@ int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) } /** * nvdimm_flush - flush any posted 
write queues between the cpu and pmem media - * @nd_region: blk or interleaved pmem region + * @nd_region: interleaved pmem region */ int generic_nvdimm_flush(struct nd_region *nd_region) { @@ -1234,7 +1112,7 @@ EXPORT_SYMBOL_GPL(nvdimm_flush); /** * nvdimm_has_flush - determine write flushing requirements - * @nd_region: blk or interleaved pmem region + * @nd_region: interleaved pmem region * * Returns 1 if writes require flushing * Returns 0 if writes do not require flushing diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 7074aa9af525..0d61e07b6827 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -25,8 +25,6 @@ struct badrange { }; enum { - /* when a dimm supports both PMEM and BLK access a label is required */ - NDD_ALIASING = 0, /* unarmed memory devices may not persist writes */ NDD_UNARMED = 1, /* locked memory devices should not be accessed */ @@ -35,8 +33,6 @@ enum { NDD_SECURITY_OVERWRITE = 3, /* tracking whether or not there is a pending device reference */ NDD_WORK_PENDING = 4, - /* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */ - NDD_NOBLK = 5, /* dimm supports namespace labels */ NDD_LABELING = 6, @@ -140,21 +136,6 @@ static inline void __iomem *devm_nvdimm_ioremap(struct device *dev, } struct nvdimm_bus; -struct module; -struct nd_blk_region; -struct nd_blk_region_desc { - int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); - int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, - void *iobuf, u64 len, int rw); - struct nd_region_desc ndr_desc; -}; - -static inline struct nd_blk_region_desc *to_blk_region_desc( - struct nd_region_desc *ndr_desc) -{ - return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); - -} /* * Note that separate bits for locked + unlocked are defined so that @@ -257,7 +238,6 @@ struct nvdimm_bus *nvdimm_to_bus(struct nvdimm *nvdimm); struct nvdimm *to_nvdimm(struct device *dev); struct nd_region *to_nd_region(struct device *dev); struct device *nd_region_dev(struct nd_region *nd_region); -struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); @@ -295,10 +275,6 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); void *nd_region_provider_data(struct nd_region *nd_region); -void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); -void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); -struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); -unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr); unsigned int nd_region_acquire_lane(struct nd_region *nd_region); void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 8cf1e4884fd5..17e02b64ea2e 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -189,7 +189,6 @@ static inline const char *nvdimm_cmd_name(unsigned cmd) #define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */ #define ND_DEVICE_NAMESPACE_IO 4 /* legacy persistent memory */ #define ND_DEVICE_NAMESPACE_PMEM 5 /* PMEM namespace (may alias with BLK) */ -#define ND_DEVICE_NAMESPACE_BLK 6 /* 
BLK namespace (may alias with PMEM) */ #define ND_DEVICE_DAX_PMEM 7 /* Device DAX interface to pmem */ enum nd_driver_flags { @@ -198,7 +197,6 @@ enum nd_driver_flags { ND_DRIVER_REGION_BLK = 1 << ND_DEVICE_REGION_BLK, ND_DRIVER_NAMESPACE_IO = 1 << ND_DEVICE_NAMESPACE_IO, ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM, - ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK, ND_DRIVER_DAX_PMEM = 1 << ND_DEVICE_DAX_PMEM, }; diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c index 3ca7c32e9362..4d1a947367f9 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -338,62 +338,6 @@ static int ndtest_ctl(struct nvdimm_bus_descriptor *nd_desc, return 0; } -static int ndtest_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, - void *iobuf, u64 len, int rw) -{ - struct ndtest_dimm *dimm = ndbr->blk_provider_data; - struct ndtest_blk_mmio *mmio = dimm->mmio; - struct nd_region *nd_region = &ndbr->nd_region; - unsigned int lane; - - if (!mmio) - return -ENOMEM; - - lane = nd_region_acquire_lane(nd_region); - if (rw) - memcpy(mmio->base + dpa, iobuf, len); - else { - memcpy(iobuf, mmio->base + dpa, len); - arch_invalidate_pmem(mmio->base + dpa, len); - } - - nd_region_release_lane(nd_region, lane); - - return 0; -} - -static int ndtest_blk_region_enable(struct nvdimm_bus *nvdimm_bus, - struct device *dev) -{ - struct nd_blk_region *ndbr = to_nd_blk_region(dev); - struct nvdimm *nvdimm; - struct ndtest_dimm *dimm; - struct ndtest_blk_mmio *mmio; - - nvdimm = nd_blk_region_to_dimm(ndbr); - dimm = nvdimm_provider_data(nvdimm); - - nd_blk_region_set_provider_data(ndbr, dimm); - dimm->blk_region = to_nd_region(dev); - - mmio = devm_kzalloc(dev, sizeof(struct ndtest_blk_mmio), GFP_KERNEL); - if (!mmio) - return -ENOMEM; - - mmio->base = (void __iomem *) devm_nvdimm_memremap( - dev, dimm->address, 12, nd_blk_memremap_flags(ndbr)); - if (!mmio->base) { - dev_err(dev, "%s failed to map blk dimm\n", nvdimm_name(nvdimm)); - return -ENOMEM; - } - mmio->size = dimm->size; - mmio->base_offset = 0; - - dimm->mmio = mmio; - - return 0; -} - static struct nfit_test_resource *ndtest_resource_lookup(resource_size_t addr) { int i; @@ -523,17 +467,16 @@ static int ndtest_create_region(struct ndtest_priv *p, struct ndtest_region *region) { struct nd_mapping_desc mappings[NDTEST_MAX_MAPPING]; - struct nd_blk_region_desc ndbr_desc; + struct nd_region_desc *ndr_desc, _ndr_desc; struct nd_interleave_set *nd_set; - struct nd_region_desc *ndr_desc; struct resource res; int i, ndimm = region->mapping[0].dimm; u64 uuid[2]; memset(&res, 0, sizeof(res)); memset(&mappings, 0, sizeof(mappings)); - memset(&ndbr_desc, 0, sizeof(ndbr_desc)); - ndr_desc = &ndbr_desc.ndr_desc; + memset(&_ndr_desc, 0, sizeof(_ndr_desc)); + ndr_desc = &_ndr_desc; if (!ndtest_alloc_resource(p, region->size, &res.start)) return -ENOMEM; @@ -857,10 +800,8 @@ static int ndtest_dimm_register(struct ndtest_priv *priv, struct device *dev = &priv->pdev.dev; unsigned long dimm_flags = dimm->flags; - if (dimm->num_formats > 1) { - set_bit(NDD_ALIASING, &dimm_flags); + if (dimm->num_formats > 1) set_bit(NDD_LABELING, &dimm_flags); - } if (dimm->flags & PAPR_PMEM_UNARMED_MASK) set_bit(NDD_UNARMED, &dimm_flags); -- cgit v1.2.3 From 8109517b394e6deab5fd21cc5460e82ffed229c6 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Mon, 24 Jan 2022 11:25:24 +0100 Subject: rpmsg: ctrl: Introduce new RPMSG_CREATE/RELEASE_DEV_IOCTL controls Allow the user space application to create and 
release an rpmsg device by adding RPMSG_CREATE_DEV_IOCTL and RPMSG_RELEASE_DEV_IOCTL ioctls to the /dev/rpmsg_ctrl interface. The RPMSG_CREATE_DEV_IOCTL ioctl can be used to instantiate a local rpmsg device. Depending on the back-end implementation, the associated rpmsg driver is probed and an NS announcement can be sent to the remote processor. The RPMSG_RELEASE_DEV_IOCTL allows the user application to release an rpmsg device created either by the remote processor or with the RPMSG_CREATE_DEV_IOCTL call. Depending on the back-end implementation, the associated rpmsg driver is removed and an NS destroy message can be sent to the remote processor. Suggested-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Reviewed-by: Mathieu Poirier Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220124102524.295783-12-arnaud.pouliquen@foss.st.com --- drivers/rpmsg/rpmsg_ctrl.c | 36 ++++++++++++++++++++++++++++++++---- include/uapi/linux/rpmsg.h | 10 ++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/rpmsg/rpmsg_ctrl.c b/drivers/rpmsg/rpmsg_ctrl.c index f43b5e4dbb4c..107da70fdbaa 100644 --- a/drivers/rpmsg/rpmsg_ctrl.c +++ b/drivers/rpmsg/rpmsg_ctrl.c @@ -43,11 +43,13 @@ static DEFINE_IDA(rpmsg_minor_ida); * @rpdev: underlaying rpmsg device * @cdev: cdev for the ctrl device * @dev: device for the ctrl device + * @ctrl_lock: serialize the ioctrls. */ struct rpmsg_ctrldev { struct rpmsg_device *rpdev; struct cdev cdev; struct device dev; + struct mutex ctrl_lock; }; static int rpmsg_ctrldev_open(struct inode *inode, struct file *filp) @@ -76,9 +78,8 @@ static long rpmsg_ctrldev_ioctl(struct file *fp, unsigned int cmd, void __user *argp = (void __user *)arg; struct rpmsg_endpoint_info eptinfo; struct rpmsg_channel_info chinfo; - - if (cmd != RPMSG_CREATE_EPT_IOCTL) - return -EINVAL; + struct rpmsg_device *rpdev; + int ret = 0; if (copy_from_user(&eptinfo, argp, sizeof(eptinfo))) return -EFAULT; @@ -88,7 +89,33 @@ static long rpmsg_ctrldev_ioctl(struct file *fp, unsigned int cmd, chinfo.src = eptinfo.src; chinfo.dst = eptinfo.dst; - return rpmsg_chrdev_eptdev_create(ctrldev->rpdev, &ctrldev->dev, chinfo); + mutex_lock(&ctrldev->ctrl_lock); + switch (cmd) { + case RPMSG_CREATE_EPT_IOCTL: + ret = rpmsg_chrdev_eptdev_create(ctrldev->rpdev, &ctrldev->dev, chinfo); + break; + + case RPMSG_CREATE_DEV_IOCTL: + rpdev = rpmsg_create_channel(ctrldev->rpdev, &chinfo); + if (!rpdev) { + dev_err(&ctrldev->dev, "failed to create %s channel\n", chinfo.name); + ret = -ENXIO; + } + break; + + case RPMSG_RELEASE_DEV_IOCTL: + ret = rpmsg_release_channel(ctrldev->rpdev, &chinfo); + if (ret) + dev_err(&ctrldev->dev, "failed to release %s channel (%d)\n", + chinfo.name, ret); + break; + + default: + ret = -EINVAL; + } + mutex_unlock(&ctrldev->ctrl_lock); + + return ret; }; static const struct file_operations rpmsg_ctrldev_fops = { @@ -125,6 +152,7 @@ static int rpmsg_ctrldev_probe(struct rpmsg_device *rpdev) dev->parent = &rpdev->dev; dev->class = rpmsg_class; + mutex_init(&ctrldev->ctrl_lock); cdev_init(&ctrldev->cdev, &rpmsg_ctrldev_fops); ctrldev->cdev.owner = THIS_MODULE; diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h index f5ca8740f3fb..1637e68177d9 100644 --- a/include/uapi/linux/rpmsg.h +++ b/include/uapi/linux/rpmsg.h @@ -33,4 +33,14 @@ struct rpmsg_endpoint_info { */ #define RPMSG_DESTROY_EPT_IOCTL _IO(0xb5, 0x2) +/** + * Instantiate a new local rpmsg service device.
+ */ +#define RPMSG_CREATE_DEV_IOCTL _IOW(0xb5, 0x3, struct rpmsg_endpoint_info) + +/** + * Release a local rpmsg device. + */ +#define RPMSG_RELEASE_DEV_IOCTL _IOW(0xb5, 0x4, struct rpmsg_endpoint_info) + #endif -- cgit v1.2.3 From b5fdf66f6eb2560784c6f60131dc567de06267dc Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Tue, 1 Mar 2022 14:37:27 -0500 Subject: NFS: Remove remaining dfprintks related to fscache and remove NFSDBG_FSCACHE The fscache cookie APIs including fscache_acquire_cookie() and fscache_relinquish_cookie() now have very good tracing. Thus, there is no real need for dfprintks in the NFS fscache interface. All dfprintks have been removed from the NFS fscache interface, so remove the NFSDBG_FSCACHE define as well. Signed-off-by: Dave Wysochanski Signed-off-by: Trond Myklebust --- fs/nfs/fscache.c | 10 ---------- include/uapi/linux/nfs_fs.h | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 841b69aef189..4dee53ceb941 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -21,8 +21,6 @@ #include "fscache.h" #include "nfstrace.h" -#define NFSDBG_FACILITY NFSDBG_FSCACHE - #define NFS_MAX_KEY_LEN 1000 static bool nfs_append_int(char *key, int *_len, unsigned long long x) @@ -129,8 +127,6 @@ int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int u vcookie = fscache_acquire_volume(key, NULL, /* preferred_cache */ NULL, 0 /* coherency_data */); - dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", - nfss, vcookie); if (IS_ERR(vcookie)) { if (vcookie != ERR_PTR(-EBUSY)) { kfree(key); @@ -153,9 +149,6 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) { struct nfs_server *nfss = NFS_SB(sb); - dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - fscache_relinquish_volume(nfss->fscache, NULL, false); nfss->fscache = NULL; kfree(nfss->fscache_uniq); @@ -193,8 +186,6 @@ void nfs_fscache_clear_inode(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } @@ -229,7 +220,6 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) fscache_use_cookie(cookie, open_for_write); if (open_for_write) { - dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); nfs_fscache_update_auxdata(&auxdata, inode); fscache_invalidate(cookie, &auxdata, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); diff --git a/include/uapi/linux/nfs_fs.h b/include/uapi/linux/nfs_fs.h index 3afe3767c55d..ae0de165c014 100644 --- a/include/uapi/linux/nfs_fs.h +++ b/include/uapi/linux/nfs_fs.h @@ -52,7 +52,7 @@ #define NFSDBG_CALLBACK 0x0100 #define NFSDBG_CLIENT 0x0200 #define NFSDBG_MOUNT 0x0400 -#define NFSDBG_FSCACHE 0x0800 +#define NFSDBG_FSCACHE 0x0800 /* unused */ #define NFSDBG_PNFS 0x1000 #define NFSDBG_PNFS_LD 0x2000 #define NFSDBG_STATE 0x4000 -- cgit v1.2.3 From 2c7d2a230237e7c43fa067d695937b7e484bb92a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 15 Dec 2021 15:39:58 -0500 Subject: btrfs: add definition for EXTENT_TREE_V2 This adds the initial definition of the EXTENT_TREE_V2 incompat feature flag. This also hides the support behind CONFIG_BTRFS_DEBUG. THIS IS AN IN-DEVELOPMENT FORMAT CHANGE, DO NOT USE UNLESS YOU ARE A DEVELOPER OR A TESTER.
The format is in flux and will be added in stages; any fs will need to be re-made between updates to the format. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 21 +++++++++++++++++++++ fs/btrfs/sysfs.c | 5 ++++- include/uapi/linux/btrfs.h | 1 + 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ebb2d109e8bb..54424b341132 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -300,6 +300,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -314,6 +334,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) +#endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index beb7f72d50b8..5f4812fd8b50 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -/* Remove once support for zoned allocation is feature complete */ #ifdef CONFIG_BTRFS_DEBUG +/* Remove once support for zoned allocation is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +/* Remove once support for extent tree v2 is feature complete */ +BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid1c34), #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 738619994e26..1cb1a3860f1d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -309,6 +309,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) #define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) #define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12) +#define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) struct btrfs_ioctl_feature_flags { __u64 compat_flags; -- cgit v1.2.3 From 9c54e80ddc6bd89596a4046d451908700476fd14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 15 Dec 2021 15:40:07 -0500 Subject: btrfs: add code to support the block group root This
code adds the on disk structures for the block group root, which will hold the block group items for extent tree v2. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 27 ++++++++++++++++++++++- fs/btrfs/disk-io.c | 49 +++++++++++++++++++++++++++++++++++------ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/print-tree.c | 1 + include/trace/events/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 3 +++ 6 files changed, 75 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54424b341132..f460a7bb9ae8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -274,8 +274,14 @@ struct btrfs_super_block { /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* Extent tree v2 */ + __le64 block_group_root; + __le64 block_group_root_generation; + u8 block_group_root_level; + /* future expansion */ - __le64 reserved[28]; + u8 reserved8[7]; + __le64 reserved[25]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -657,6 +663,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -2349,6 +2356,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); +/* + * For extent tree v2 we overload the extent root with the block group root, as + * we will have multiple extent roots. + */ +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, + struct btrfs_root_backup, extent_root_level, 8); + /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); @@ -2483,6 +2501,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, + block_group_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, + struct btrfs_super_block, + block_group_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, + block_group_root_level, 8); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e014cb8413c7..fe1349737edb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1727,6 +1727,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -2095,7 +2096,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); struct 
btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2121,11 +2121,23 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(extent_root->node)); + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { + btrfs_set_backup_block_group_root(root_backup, + info->block_group_root->node->start); + btrfs_set_backup_block_group_root_gen(root_backup, + btrfs_header_generation(info->block_group_root->node)); + btrfs_set_backup_block_group_root_level(root_backup, + btrfs_header_level(info->block_group_root->node)); + } else { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + } /* * we might commit during log recovery, which happens before we set @@ -2269,6 +2281,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2964,8 +2977,20 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) gen = btrfs_super_generation(sb); level = btrfs_super_root_level(sb); ret = load_super_root(fs_info->tree_root, bytenr, gen, level); - if (ret) + if (ret) { btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + bytenr = btrfs_super_block_group_root(sb); + gen = btrfs_super_block_group_root_generation(sb); + level = btrfs_super_block_group_root_level(sb); + ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); + if (ret) + btrfs_warn(fs_info, "couldn't read block group root"); return ret; } @@ -2978,6 +3003,16 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + struct btrfs_root *root; + + root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, + GFP_KERNEL); + if (!root) + return -ENOMEM; + fs_info->block_group_root = root; + } + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e8bef4b7563..2e10514ecda8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -111,6 +111,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0775ae9f4419..524fdb0ddd74 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = { { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, { 
BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, }; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0d729664b4b4..f068ff30d654 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -53,6 +53,7 @@ struct btrfs_space_info; { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, \ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, \ + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },\ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) #define show_root_type(obj) \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 5416f1f1a77a..b069752a8ecf 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -53,6 +53,9 @@ /* tracks free space in block groups. */ #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL +/* Holds the block group items for extent tree v2. */ +#define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL + /* device stats in the device tree */ #define BTRFS_DEV_STATS_OBJECTID 0ULL -- cgit v1.2.3 From dcb77a9ae87dc1ae2c54ea2e629da357e694b664 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 16 Aug 2021 15:58:29 -0700 Subject: btrfs: add definitions and documentation for encoded I/O ioctls In order to allow sending and receiving compressed data without decompressing it, we need an interface to write pre-compressed data directly to the filesystem and the matching interface to read compressed data without decompressing it. This adds the definitions for ioctls to do that and detailed explanations of how to use them. Reviewed-by: Nikolay Borisov Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 132 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 1cb1a3860f1d..d956b2993970 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -869,6 +869,134 @@ struct btrfs_ioctl_get_subvol_rootref_args { __u8 align[7]; }; +/* + * Data and metadata for an encoded read or write. + * + * Encoded I/O bypasses any encoding automatically done by the filesystem (e.g., + * compression). This can be used to read the compressed contents of a file or + * write pre-compressed data directly to a file. + * + * BTRFS_IOC_ENCODED_READ and BTRFS_IOC_ENCODED_WRITE are essentially + * preadv/pwritev with additional metadata about how the data is encoded and the + * size of the unencoded data. + * + * BTRFS_IOC_ENCODED_READ fills the given iovecs with the encoded data, fills + * the metadata fields, and returns the size of the encoded data. It reads one + * extent per call. It can also read data which is not encoded. + * + * BTRFS_IOC_ENCODED_WRITE uses the metadata fields, writes the encoded data + * from the iovecs, and returns the size of the encoded data. Note that the + * encoded data is not validated when it is written; if it is not valid (e.g., + * it cannot be decompressed), then a subsequent read may return an error. + * + * Since the filesystem page cache contains decoded data, encoded I/O bypasses + * the page cache. Encoded I/O requires CAP_SYS_ADMIN. + */ +struct btrfs_ioctl_encoded_io_args { + /* Input parameters for both reads and writes. */ + + /* + * iovecs containing encoded data. 
+ * + * For reads, if the size of the encoded data is larger than the sum of + * iov[n].iov_len for 0 <= n < iovcnt, then the ioctl fails with + * ENOBUFS. + * + * For writes, the size of the encoded data is the sum of iov[n].iov_len + * for 0 <= n < iovcnt. This must be less than 128 KiB (this limit may + * increase in the future). This must also be less than or equal to + * unencoded_len. + */ + const struct iovec __user *iov; + /* Number of iovecs. */ + unsigned long iovcnt; + /* + * Offset in file. + * + * For writes, must be aligned to the sector size of the filesystem. + */ + __s64 offset; + /* Currently must be zero. */ + __u64 flags; + + /* + * For reads, the following members are output parameters that will + * contain the returned metadata for the encoded data. + * For writes, the following members must be set to the metadata for the + * encoded data. + */ + + /* + * Length of the data in the file. + * + * Must be less than or equal to unencoded_len - unencoded_offset. For + * writes, must be aligned to the sector size of the filesystem unless + * the data ends at or beyond the current end of the file. + */ + __u64 len; + /* + * Length of the unencoded (i.e., decrypted and decompressed) data. + * + * For writes, must be no more than 128 KiB (this limit may increase in + * the future). If the unencoded data is actually longer than + * unencoded_len, then it is truncated; if it is shorter, then it is + * extended with zeroes. + */ + __u64 unencoded_len; + /* + * Offset from the first byte of the unencoded data to the first byte of + * logical data in the file. + * + * Must be less than unencoded_len. + */ + __u64 unencoded_offset; + /* + * BTRFS_ENCODED_IO_COMPRESSION_* type. + * + * For writes, must not be BTRFS_ENCODED_IO_COMPRESSION_NONE. + */ + __u32 compression; + /* Currently always BTRFS_ENCODED_IO_ENCRYPTION_NONE. */ + __u32 encryption; + /* + * Reserved for future expansion. + * + * For reads, always returned as zero. Users should check for non-zero + * bytes. If there are any, then the kernel has a newer version of this + * structure with additional information that the user definition is + * missing. + * + * For writes, must be zeroed. + */ + __u8 reserved[64]; +}; + +/* Data is not compressed. */ +#define BTRFS_ENCODED_IO_COMPRESSION_NONE 0 +/* Data is compressed as a single zlib stream. */ +#define BTRFS_ENCODED_IO_COMPRESSION_ZLIB 1 +/* + * Data is compressed as a single zstd frame with the windowLog compression + * parameter set to no more than 17. + */ +#define BTRFS_ENCODED_IO_COMPRESSION_ZSTD 2 +/* + * Data is compressed sector by sector (using the sector size indicated by the + * name of the constant) with LZO1X and wrapped in the format documented in + * fs/btrfs/lzo.c. For writes, the compression sector size must match the + * filesystem sector size. + */ +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_4K 3 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_8K 4 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_16K 5 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_32K 6 +#define BTRFS_ENCODED_IO_COMPRESSION_LZO_64K 7 +#define BTRFS_ENCODED_IO_COMPRESSION_TYPES 8 + +/* Data is not encrypted. 
*/ +#define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 +#define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 + /* Error codes as returned by the kernel */ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, @@ -997,5 +1125,9 @@ enum btrfs_err_code { struct btrfs_ioctl_ino_lookup_user_args) #define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_ENCODED_READ _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args) +#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args) #endif /* _UAPI_LINUX_BTRFS_H */ -- cgit v1.2.3 From 65722ff6181aa52c3d5b0929004af22a3a63e148 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 8 Mar 2022 14:00:50 -0500 Subject: drm/amdkfd: CRIU export dmabuf handles for GTT BOs Export dmabuf handles for GTT BOs so that their contents can be accessed using SDMA during checkpoint/restore. v2: Squash in fix from David to set dmabuf handle to invalid for BOs that cannot be accessed using SDMA during checkpoint/restore. Signed-off-by: David Yat Sin Reviewed-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 18 ++++++++++++++---- include/uapi/linux/kfd_ioctl.h | 5 ++++- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 2c7d76e67ddb..607f65ab39ac 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1759,14 +1759,18 @@ static int criu_checkpoint_bos(struct kfd_process *p, goto exit; } } - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { + if (bo_bucket->alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { ret = criu_get_prime_handle(&dumper_bo->tbo.base, bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
DRM_RDWR : 0, &bo_bucket->dmabuf_fd); if (ret) goto exit; + } else { + bo_bucket->dmabuf_fd = KFD_INVALID_FD; } + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id); @@ -1812,7 +1816,8 @@ static int criu_checkpoint_bos(struct kfd_process *p, exit: while (ret && bo_index--) { - if (bo_buckets[bo_index].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + if (bo_buckets[bo_index].alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) close_fd(bo_buckets[bo_index].dmabuf_fd); } @@ -2211,12 +2216,16 @@ static int criu_restore_bo(struct kfd_process *p, pr_debug("map memory was successful for the BO\n"); /* create the dmabuf object and export the bo */ - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { + if (bo_bucket->alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { ret = criu_get_prime_handle(&kgd_mem->bo->tbo.base, DRM_RDWR, &bo_bucket->dmabuf_fd); if (ret) return ret; + } else { + bo_bucket->dmabuf_fd = KFD_INVALID_FD; } + return 0; } @@ -2281,7 +2290,8 @@ static int criu_restore_bos(struct kfd_process *p, exit: while (ret && i--) { - if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) + if (bo_buckets[i].alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) close_fd(bo_buckets[i].dmabuf_fd); } kvfree(bo_buckets); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index b40687bf1014..42975e940758 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -33,9 +33,10 @@ * - 1.5 - Add SVM API * - 1.6 - Query clear flags in SVM get_attr API * - 1.7 - Checkpoint Restore (CRIU) API + * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 7 +#define KFD_IOCTL_MINOR_VERSION 8 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -195,6 +196,8 @@ struct kfd_ioctl_dbg_wave_control_args { __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; +#define KFD_INVALID_FD 0xffffffff + /* Matching HSA_EVENTTYPE */ #define KFD_IOC_EVENT_SIGNAL 0 #define KFD_IOC_EVENT_NODECHANGE 1 -- cgit v1.2.3 From 435fe1c0c1f74b682dba85641406abf4337aade6 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Wed, 16 Mar 2022 08:15:57 +0200 Subject: net: geneve: support IPv4/IPv6 as inner protocol This patch adds support for encapsulating IPv4/IPv6 within GENEVE. In order to use this, a new IFLA_GENEVE_INNER_PROTO_INHERIT flag needs to be provided at device creation. This property cannot be changed for the time being. In case IP traffic is received on a geneve device that was not created with this flag (a non-tun device), the drop count is increased.
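For reference, the sketch below shows one way userspace could request the new mode over rtnetlink when creating a geneve link; iproute2 support is outside this patch. It is illustrative only: the put_attr() helper, the "gnv0" name, the VNI and the remote address are made up for the example, and ACK/error handling is omitted. The attribute nesting mirrors what geneve_nl2info()/geneve_fill_info() above expect, with IFLA_GENEVE_INNER_PROTO_INHERIT sent as a zero-length flag attribute.

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

/* Append one attribute to the message tail; len == 0 yields a flag. */
static struct rtattr *put_attr(struct nlmsghdr *nlh, unsigned short type,
                               const void *data, unsigned short len)
{
        struct rtattr *rta = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

        rta->rta_type = type;
        rta->rta_len = RTA_LENGTH(len);
        if (len)
                memcpy(RTA_DATA(rta), data, len);
        nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
        return rta;
}

int main(void)
{
        union {
                struct nlmsghdr hdr;
                char buf[512];
        } req = { 0 };
        struct nlmsghdr *nlh = &req.hdr;
        struct rtattr *linkinfo, *infodata;
        struct ifinfomsg *ifi;
        struct in_addr remote;
        __u32 vni = 100;        /* illustrative VNI */
        int fd;

        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*ifi));
        nlh->nlmsg_type = RTM_NEWLINK;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
        ifi = NLMSG_DATA(nlh);
        ifi->ifi_family = AF_UNSPEC;

        put_attr(nlh, IFLA_IFNAME, "gnv0", 5);
        linkinfo = put_attr(nlh, IFLA_LINKINFO, NULL, 0);
        put_attr(nlh, IFLA_INFO_KIND, "geneve", 6);
        infodata = put_attr(nlh, IFLA_INFO_DATA, NULL, 0);
        put_attr(nlh, IFLA_GENEVE_ID, &vni, sizeof(vni));
        inet_pton(AF_INET, "192.0.2.1", &remote);      /* illustrative remote */
        put_attr(nlh, IFLA_GENEVE_REMOTE, &remote, sizeof(remote));
        /* Zero-length flag attribute: ask for inner L3 protocol inheritance. */
        put_attr(nlh, IFLA_GENEVE_INNER_PROTO_INHERIT, NULL, 0);
        /* Close the nests now that all nested attributes are in place. */
        infodata->rta_len = (unsigned short)((char *)nlh + nlh->nlmsg_len - (char *)infodata);
        linkinfo->rta_len = (unsigned short)((char *)nlh + nlh->nlmsg_len - (char *)linkinfo);

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        if (fd < 0 || send(fd, nlh, nlh->nlmsg_len, 0) < 0)
                return 1;
        close(fd);
        return 0;
}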
Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20220316061557.431872-1-eyal.birger@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/geneve.c | 82 ++++++++++++++++++++++++++++++++++---------- include/uapi/linux/if_link.h | 1 + 2 files changed, 64 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index a895ff756093..8f30660224c5 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -56,6 +56,7 @@ struct geneve_config { bool use_udp6_rx_checksums; bool ttl_inherit; enum ifla_geneve_df df; + bool inner_proto_inherit; }; /* Pseudo network device */ @@ -251,17 +252,24 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } } - skb_reset_mac_header(skb); - skb->protocol = eth_type_trans(skb, geneve->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - if (tun_dst) skb_dst_set(skb, &tun_dst->dst); - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { - geneve->dev->stats.rx_errors++; - goto drop; + if (gnvh->proto_type == htons(ETH_P_TEB)) { + skb_reset_mac_header(skb); + skb->protocol = eth_type_trans(skb, geneve->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, + geneve->dev->dev_addr)) { + geneve->dev->stats.rx_errors++; + goto drop; + } + } else { + skb_reset_mac_header(skb); + skb->dev = geneve->dev; + skb->pkt_type = PACKET_HOST; } oiph = skb_network_header(skb); @@ -345,6 +353,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct genevehdr *geneveh; struct geneve_dev *geneve; struct geneve_sock *gs; + __be16 inner_proto; int opts_len; /* Need UDP and Geneve header to be present */ @@ -356,7 +365,11 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(geneveh->ver != GENEVE_VER)) goto drop; - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + inner_proto = geneveh->proto_type; + + if (unlikely((inner_proto != htons(ETH_P_TEB) && + inner_proto != htons(ETH_P_IP) && + inner_proto != htons(ETH_P_IPV6)))) goto drop; gs = rcu_dereference_sk_user_data(sk); @@ -367,9 +380,14 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!geneve) goto drop; + if (unlikely((!geneve->cfg.inner_proto_inherit && + inner_proto != htons(ETH_P_TEB)))) { + geneve->dev->stats.rx_dropped++; + goto drop; + } + opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB), + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { geneve->dev->stats.rx_dropped++; goto drop; @@ -717,7 +735,8 @@ static int geneve_stop(struct net_device *dev) } static void geneve_build_header(struct genevehdr *geneveh, - const struct ip_tunnel_info *info) + const struct ip_tunnel_info *info, + __be16 inner_proto) { geneveh->ver = GENEVE_VER; geneveh->opt_len = info->options_len / 4; @@ -725,7 +744,7 @@ static void geneve_build_header(struct genevehdr *geneveh, geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT); geneveh->rsvd1 = 0; tunnel_id_to_vni(info->key.tun_id, geneveh->vni); - geneveh->proto_type = htons(ETH_P_TEB); + geneveh->proto_type = inner_proto; geneveh->rsvd2 = 0; if (info->key.tun_flags & TUNNEL_GENEVE_OPT) @@ -734,10 +753,12 @@ static void geneve_build_header(struct genevehdr *geneveh, static int 
geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, const struct ip_tunnel_info *info, - bool xnet, int ip_hdr_len) + bool xnet, int ip_hdr_len, + bool inner_proto_inherit) { bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); struct genevehdr *gnvh; + __be16 inner_proto; int min_headroom; int err; @@ -755,8 +776,9 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, goto free_dst; gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); - geneve_build_header(gnvh, info); - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + inner_proto = inner_proto_inherit ? skb->protocol : htons(ETH_P_TEB); + geneve_build_header(gnvh, info, inner_proto); + skb_set_inner_protocol(skb, inner_proto); return 0; free_dst: @@ -959,7 +981,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } } - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); + err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), + geneve->cfg.inner_proto_inherit); if (unlikely(err)) return err; @@ -1038,7 +1061,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; ttl = ttl ? : ip6_dst_hoplimit(dst); } - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); + err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), + geneve->cfg.inner_proto_inherit); if (unlikely(err)) return err; @@ -1388,6 +1412,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, dst_cache_reset(&geneve->cfg.info.dst_cache); memcpy(&geneve->cfg, cfg, sizeof(*cfg)); + if (geneve->cfg.inner_proto_inherit) { + dev->header_ops = NULL; + dev->type = ARPHRD_NONE; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->flags = IFF_NOARP; + } + err = register_netdevice(dev); if (err) return err; @@ -1561,10 +1593,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], #endif } + if (data[IFLA_GENEVE_INNER_PROTO_INHERIT]) { + if (changelink) { + attrtype = IFLA_GENEVE_INNER_PROTO_INHERIT; + goto change_notsup; + } + cfg->inner_proto_inherit = true; + } + return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], - "Changing VNI, Port, endpoint IP address family, external, and UDP checksum attributes are not supported"); + "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } @@ -1799,6 +1839,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) goto nla_put_failure; + if (geneve->cfg.inner_proto_inherit && + nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bd24c7dc10a2..cc284c048e69 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -842,6 +842,7 @@ enum { IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, + IFLA_GENEVE_INNER_PROTO_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From ec7328b59176227216c461601c6bd0e922232a9b Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:43 +0100 Subject: net: bridge: mst: Multiple Spanning Tree (MST) mode Allow the user to switch from the current per-VLAN STP mode to an MST mode. Up to this point, per-VLAN STP states were always isolated from each other.
This is in contrast to the MSTP standard (802.1Q-2018, Clause 13.5), where VLANs are grouped into MST instances (MSTIs), and the state is managed on a per-MSTI level, rather than at the per-VLAN level. Perhaps due to the prevalence of the standard, many switching ASICs are built after the same model. Therefore, add a corresponding MST mode to the bridge, which we can later add offloading support for in a straightforward way. For now, all VLANs are fixed to MSTI 0, also called the Common Spanning Tree (CST). That is, all VLANs will follow the port-global state. Upcoming changes will make this actually useful by allowing VLANs to be mapped to arbitrary MSTIs and allow individual MSTI states to be changed. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + net/bridge/Makefile | 2 +- net/bridge/br.c | 5 +++ net/bridge/br_input.c | 17 ++++++++- net/bridge/br_mst.c | 87 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 37 ++++++++++++++++++ net/bridge/br_stp.c | 6 +++ net/bridge/br_vlan.c | 20 +++++++++- net/bridge/br_vlan_options.c | 5 +++ 9 files changed, 176 insertions(+), 4 deletions(-) create mode 100644 net/bridge/br_mst.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2711c3522010..30a242195ced 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -759,6 +759,7 @@ struct br_mcast_stats { enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, BR_BOOLOPT_MCAST_VLAN_SNOOPING, + BR_BOOLOPT_MST_ENABLE, BR_BOOLOPT_MAX }; diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 7fb9a021873b..24bd1c0a9a5a 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o br_multicast_eht.o -bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o br_mst.o bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o diff --git a/net/bridge/br.c b/net/bridge/br.c index b1dea3febeea..96e91d69a9a8 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -265,6 +265,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; + case BR_BOOLOPT_MST_ENABLE: + err = br_mst_set_enabled(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -281,6 +284,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); + case BR_BOOLOPT_MST_ENABLE: + return br_opt_get(br, BROPT_MST_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e0c13fcc50ed..196417859c4a 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -78,13 +78,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb u16 vid = 0; u8 state; - if (!p || p->state == BR_STATE_DISABLED) + if (!p) goto drop; br = p->br; + + if (br_mst_is_enabled(br)) { + state = BR_STATE_FORWARDING; + } else { + if (p->state == BR_STATE_DISABLED) + goto drop; + + state = p->state; + } + brmctx =
&p->br->multicast_ctx; pmctx = &p->multicast_ctx; - state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, &state, &vlan)) goto out; @@ -370,9 +379,13 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; forward: + if (br_mst_is_enabled(p->br)) + goto defer_stp_filtering; + switch (p->state) { case BR_STATE_FORWARDING: case BR_STATE_LEARNING: +defer_stp_filtering: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c new file mode 100644 index 000000000000..0f9f596f86bc --- /dev/null +++ b/net/bridge/br_mst.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Bridge Multiple Spanning Tree Support + * + * Authors: + * Tobias Waldekranz + */ + +#include + +#include "br_private.h" + +DEFINE_STATIC_KEY_FALSE(br_mst_used); + +static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_vlan *v, + u8 state) +{ + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + + if (v->state == state) + return; + + br_vlan_set_state(v, state); + + if (v->vid == vg->pvid) + br_vlan_set_pvid_state(vg, state); +} + +int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + + vg = nbp_vlan_group(p); + if (!vg) + return 0; + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->brvlan->msti != msti) + continue; + + br_mst_vlan_set_state(p, v, state); + } + + return 0; +} + +void br_mst_vlan_init_state(struct net_bridge_vlan *v) +{ + /* VLANs always start out in MSTI 0 (CST) */ + v->msti = 0; + + if (br_vlan_is_master(v)) + v->state = BR_STATE_FORWARDING; + else + v->state = v->port->state; +} + +int br_mst_set_enabled(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + vg = nbp_vlan_group(p); + + if (!vg->num_vlans) + continue; + + NL_SET_ERR_MSG(extack, + "MST mode can't be changed while VLANs exist"); + return -EBUSY; + } + + if (br_opt_get(br, BROPT_MST_ENABLED) == on) + return 0; + + if (on) + static_branch_enable(&br_mst_used); + else + static_branch_disable(&br_mst_used); + + br_opt_toggle(br, BROPT_MST_ENABLED, on); + return 0; +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 48bc61ebc211..c2190c8841fb 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -178,6 +178,7 @@ enum { * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast * context + * @msti: if MASTER flag set, this holds the VLANs MST instance * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * @@ -210,6 +211,8 @@ struct net_bridge_vlan { struct net_bridge_mcast_port port_mcast_ctx; }; + u16 msti; + struct list_head vlist; struct rcu_head rcu; @@ -445,6 +448,7 @@ enum net_bridge_opts { BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, + BROPT_MST_ENABLED, }; struct net_bridge { @@ -1765,6 +1769,39 @@ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) } #endif +/* br_mst.c */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +DECLARE_STATIC_KEY_FALSE(br_mst_used); +static inline bool br_mst_is_enabled(struct net_bridge *br) +{ + return static_branch_unlikely(&br_mst_used) && + br_opt_get(br, BROPT_MST_ENABLED); +} 
+ +int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, + struct netlink_ext_ack *extack); +void br_mst_vlan_init_state(struct net_bridge_vlan *v); +int br_mst_set_enabled(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); +#else +static inline bool br_mst_is_enabled(struct net_bridge *br) +{ + return false; +} + +static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti, + u8 state, struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mst_set_enabled(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} +#endif + struct nf_br_ops { int (*br_dev_xmit_hook)(struct sk_buff *skb); }; diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1d80f34a139c..7d27b2e6038f 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -43,6 +43,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) return; p->state = state; + if (br_opt_get(p->br, BROPT_MST_ENABLED)) { + err = br_mst_set_state(p, 0, state, NULL); + if (err) + br_warn(p->br, "error setting MST state on port %u(%s)\n", + p->port_no, netdev_name(p->dev)); + } err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 7557e90b60e1..0f5e75ccac79 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -226,6 +226,24 @@ static void nbp_vlan_rcu_free(struct rcu_head *rcu) kfree(v); } +static void br_vlan_init_state(struct net_bridge_vlan *v) +{ + struct net_bridge *br; + + if (br_vlan_is_master(v)) + br = v->br; + else + br = v->port->br; + + if (br_opt_get(br, BROPT_MST_ENABLED)) { + br_mst_vlan_init_state(v); + return; + } + + v->state = BR_STATE_FORWARDING; + v->msti = 0; +} + /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: @@ -322,7 +340,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } /* set the state before publishing */ - v->state = BR_STATE_FORWARDING; + br_vlan_init_state(v); err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index a6382973b3e7..09112b56e79c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -99,6 +99,11 @@ static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, return -EBUSY; } + if (br_opt_get(br, BROPT_MST_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state directly when MST is enabled"); + return -EBUSY; + } + if (v->state == state) return 0; -- cgit v1.2.3 From 8c678d60562f3e5f6d0a5f5465e27930ffedb8ca Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:44 +0100 Subject: net: bridge: mst: Allow changing a VLAN's MSTI Allow a VLAN to move out of the CST (MSTI 0), to an independent tree. The user manages the VID to MSTI mappings via a global VLAN setting. The proposed iproute2 interface would be: bridge vlan global set dev br0 vid msti Changing the state in non-zero MSTIs is still not supported, but will be addressed in upcoming changes. 
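For illustration only (not part of the original submission): a minimal userspace sketch of the equivalent netlink request, moving VID 20 on br0 into MSTI 100 — roughly what a command like "bridge vlan global set dev br0 vid 20 msti 100" would send. It assumes uapi headers that already carry BRIDGE_VLANDB_GOPTS_MSTI; the nest_start()/put_u16() helpers, interface name, VID and MSTI values are local to the example, and ACK/error handling is elided.

/* Sketch: RTM_NEWVLAN + BRIDGE_VLANDB_GLOBAL_OPTIONS carrying
 * BRIDGE_VLANDB_GOPTS_ID (the VID) and BRIDGE_VLANDB_GOPTS_MSTI.
 */
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_bridge.h>

static struct nlattr *nest_start(char *buf, int *off, unsigned short type)
{
	struct nlattr *nest = (struct nlattr *)(buf + *off);

	nest->nla_type = NLA_F_NESTED | type;
	*off += NLA_HDRLEN;
	return nest;
}

static void put_u16(char *buf, int *off, unsigned short type, __u16 val)
{
	struct nlattr *nla = (struct nlattr *)(buf + *off);

	nla->nla_type = type;
	nla->nla_len = NLA_HDRLEN + sizeof(val);
	memcpy(buf + *off + NLA_HDRLEN, &val, sizeof(val));
	*off += NLA_ALIGN(nla->nla_len);
}

int main(void)
{
	char buf[256] = {};
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct br_vlan_msg *bvm = NLMSG_DATA(nlh);
	struct nlattr *opts;
	int off, fd;

	nlh->nlmsg_type = RTM_NEWVLAN;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	bvm->family = AF_BRIDGE;
	bvm->ifindex = if_nametoindex("br0");

	off = NLMSG_LENGTH(sizeof(*bvm));
	opts = nest_start(buf, &off, BRIDGE_VLANDB_GLOBAL_OPTIONS);
	put_u16(buf, &off, BRIDGE_VLANDB_GOPTS_ID, 20);
	put_u16(buf, &off, BRIDGE_VLANDB_GOPTS_MSTI, 100);
	opts->nla_len = buf + off - (char *)opts;
	nlh->nlmsg_len = off;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	send(fd, buf, nlh->nlmsg_len, 0);
	close(fd);
	return 0;
}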
Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mst.c | 42 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 1 + net/bridge/br_vlan_options.c | 15 +++++++++++++++ 4 files changed, 59 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 30a242195ced..f60244b747ae 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -564,6 +564,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, + BRIDGE_VLANDB_GOPTS_MSTI, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 0f9f596f86bc..d7a7b5d7ddb3 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -46,6 +46,48 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, return 0; } +static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) +{ + struct net_bridge_vlan_group *vg = nbp_vlan_group(pv->port); + struct net_bridge_vlan *v; + + list_for_each_entry(v, &vg->vlan_list, vlist) { + /* If this port already has a defined state in this + * MSTI (through some other VLAN membership), inherit + * it. + */ + if (v != pv && v->brvlan->msti == msti) { + br_mst_vlan_set_state(pv->port, pv, v->state); + return; + } + } + + /* Otherwise, start out in a new MSTI with all ports disabled. */ + return br_mst_vlan_set_state(pv->port, pv, BR_STATE_DISABLED); +} + +int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *pv; + struct net_bridge_port *p; + + if (mv->msti == msti) + return 0; + + mv->msti = msti; + + list_for_each_entry(p, &mv->br->port_list, list) { + vg = nbp_vlan_group(p); + + pv = br_vlan_find(vg, mv->vid); + if (pv) + br_mst_vlan_sync_state(pv, msti); + } + + return 0; +} + void br_mst_vlan_init_state(struct net_bridge_vlan *v) { /* VLANs always start out in MSTI 0 (CST) */ diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index c2190c8841fb..3978e1d9ffb5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1780,6 +1780,7 @@ static inline bool br_mst_is_enabled(struct net_bridge *br) int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, struct netlink_ext_ack *extack); +int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti); void br_mst_vlan_init_state(struct net_bridge_vlan *v); int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 09112b56e79c..a2724d03278c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -296,6 +296,7 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end) { return v_curr->vid - r_end->vid == 1 && + v_curr->msti == r_end->msti && ((v_curr->priv_flags ^ r_end->priv_flags) & BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, @@ -384,6 +385,9 @@ bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, #endif #endif + if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_MSTI, v_opts->msti)) + goto out_err; + nla_nest_end(skb, nest); return true; @@ -415,6 +419,7 @@ static size_t rtnl_vlan_global_opts_nlmsg_size(const struct 
net_bridge_vlan *v) + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ #endif + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_MSTI */ + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } @@ -564,6 +569,15 @@ static int br_vlan_process_global_one_opts(const struct net_bridge *br, } #endif #endif + if (tb[BRIDGE_VLANDB_GOPTS_MSTI]) { + u16 msti; + + msti = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_MSTI]); + err = br_mst_vlan_set_msti(v, msti); + if (err) + return err; + *changed = true; + } return 0; } @@ -583,6 +597,7 @@ static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MSTI] = NLA_POLICY_MAX(NLA_U16, VLAN_N_VID - 1), }; int br_vlan_rtm_process_global_options(struct net_device *dev, -- cgit v1.2.3 From 122c29486e1ff78033c45d0d31c710e7dc8945a5 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:45 +0100 Subject: net: bridge: mst: Support setting and reporting MST port states Make it possible to change the port state in a given MSTI by extending the bridge port netlink interface (RTM_SETLINK on PF_BRIDGE). The proposed iproute2 interface would be: bridge mst set dev <port> msti <msti> state <state> Current states in all applicable MSTIs can also be dumped via a corresponding RTM_GETLINK. The proposed iproute2 interface looks like this:

$ bridge mst
port    msti
vb1     0
          state forwarding
        100
          state disabled
vb2     0
          state forwarding
        100
          state forwarding

The preexisting per-VLAN states are still valid in the MST mode (although they are read-only), and can be queried as usual if one is interested in knowing a particular VLAN's state without having to care about the VID to MSTI mapping (in this example VLAN 20 and 30 are bound to MSTI 100):

$ bridge -d vlan
port    vlan-id
vb1     10
          state forwarding mcast_router 1
        20
          state disabled mcast_router 1
        30
          state disabled mcast_router 1
        40
          state forwarding mcast_router 1
vb2     10
          state forwarding mcast_router 1
        20
          state forwarding mcast_router 1
        30
          state forwarding mcast_router 1
        40
          state forwarding mcast_router 1

Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 16 ++++++ include/uapi/linux/rtnetlink.h | 1 + net/bridge/br_mst.c | 126 +++++++++++++++++++++++++++++++++++++++++ net/bridge/br_netlink.c | 44 +++++++++++++- net/bridge/br_private.h | 23 ++++++++ 5 files changed, 209 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f60244b747ae..a86a7e7b811f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -122,6 +122,7 @@ enum { IFLA_BRIDGE_VLAN_TUNNEL_INFO, IFLA_BRIDGE_MRP, IFLA_BRIDGE_CFM, + IFLA_BRIDGE_MST, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -453,6 +454,21 @@ enum { #define IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX (__IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX - 1) +enum { + IFLA_BRIDGE_MST_UNSPEC, + IFLA_BRIDGE_MST_ENTRY, + __IFLA_BRIDGE_MST_MAX, +}; +#define IFLA_BRIDGE_MST_MAX (__IFLA_BRIDGE_MST_MAX - 1) + +enum { + IFLA_BRIDGE_MST_ENTRY_UNSPEC, + IFLA_BRIDGE_MST_ENTRY_MSTI, + IFLA_BRIDGE_MST_ENTRY_STATE, + __IFLA_BRIDGE_MST_ENTRY_MAX, +}; +#define IFLA_BRIDGE_MST_ENTRY_MAX
(__IFLA_BRIDGE_MST_ENTRY_MAX - 1) + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 51530aade46e..83849a37db5b 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -817,6 +817,7 @@ enum { #define RTEXT_FILTER_MRP (1 << 4) #define RTEXT_FILTER_CFM_CONFIG (1 << 5) #define RTEXT_FILTER_CFM_STATUS (1 << 6) +#define RTEXT_FILTER_MST (1 << 7) /* End of information exported to user level */ diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index d7a7b5d7ddb3..5c1831c73fc2 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -127,3 +127,129 @@ int br_mst_set_enabled(struct net_bridge *br, bool on, br_opt_toggle(br, BROPT_MST_ENABLED, on); return 0; } + +size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) +{ + DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; + const struct net_bridge_vlan *v; + size_t sz; + + /* IFLA_BRIDGE_MST */ + sz = nla_total_size(0); + + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + if (test_bit(v->brvlan->msti, seen)) + continue; + + /* IFLA_BRIDGE_MST_ENTRY */ + sz += nla_total_size(0) + + /* IFLA_BRIDGE_MST_ENTRY_MSTI */ + nla_total_size(sizeof(u16)) + + /* IFLA_BRIDGE_MST_ENTRY_STATE */ + nla_total_size(sizeof(u8)); + + __set_bit(v->brvlan->msti, seen); + } + + return sz; +} + +int br_mst_fill_info(struct sk_buff *skb, + const struct net_bridge_vlan_group *vg) +{ + DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; + const struct net_bridge_vlan *v; + struct nlattr *nest; + int err = 0; + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (test_bit(v->brvlan->msti, seen)) + continue; + + nest = nla_nest_start_noflag(skb, IFLA_BRIDGE_MST_ENTRY); + if (!nest || + nla_put_u16(skb, IFLA_BRIDGE_MST_ENTRY_MSTI, v->brvlan->msti) || + nla_put_u8(skb, IFLA_BRIDGE_MST_ENTRY_STATE, v->state)) { + err = -EMSGSIZE; + break; + } + nla_nest_end(skb, nest); + + __set_bit(v->brvlan->msti, seen); + } + + return err; +} + +static const struct nla_policy br_mst_nl_policy[IFLA_BRIDGE_MST_ENTRY_MAX + 1] = { + [IFLA_BRIDGE_MST_ENTRY_MSTI] = NLA_POLICY_RANGE(NLA_U16, + 1, /* 0 reserved for CST */ + VLAN_N_VID - 1), + [IFLA_BRIDGE_MST_ENTRY_STATE] = NLA_POLICY_RANGE(NLA_U8, + BR_STATE_DISABLED, + BR_STATE_BLOCKING), +}; + +static int br_mst_process_one(struct net_bridge_port *p, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MST_ENTRY_MAX + 1]; + u16 msti; + u8 state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MST_ENTRY_MAX, attr, + br_mst_nl_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MST_ENTRY_MSTI]) { + NL_SET_ERR_MSG_MOD(extack, "MSTI not specified"); + return -EINVAL; + } + + if (!tb[IFLA_BRIDGE_MST_ENTRY_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "State not specified"); + return -EINVAL; + } + + msti = nla_get_u16(tb[IFLA_BRIDGE_MST_ENTRY_MSTI]); + state = nla_get_u8(tb[IFLA_BRIDGE_MST_ENTRY_STATE]); + + return br_mst_set_state(p, msti, state, extack); +} + +int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *attr; + int err, msts = 0; + int rem; + + if (!br_opt_get(p->br, BROPT_MST_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify MST state when MST is disabled"); + return -EBUSY; + } + + nla_for_each_nested(attr, mst_attr, rem) { + switch (nla_type(attr)) { + case IFLA_BRIDGE_MST_ENTRY: + err = br_mst_process_one(p, attr, extack); + break; + default: + continue; + } + + 
msts++; + if (err) + break; + } + + if (!msts) { + NL_SET_ERR_MSG_MOD(extack, "Found no MST entries to process"); + err = -EINVAL; + } + + return err; +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 7d4432ca9a20..a8d90fa8621e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -119,6 +119,9 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, /* Each VLAN is returned in bridge_vlan_info along with flags */ vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); + if (filter_mask & RTEXT_FILTER_MST) + vinfo_sz += br_mst_info_size(vg); + if (!(filter_mask & RTEXT_FILTER_CFM_STATUS)) return vinfo_sz; @@ -485,7 +488,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, RTEXT_FILTER_BRVLAN_COMPRESSED | RTEXT_FILTER_MRP | RTEXT_FILTER_CFM_CONFIG | - RTEXT_FILTER_CFM_STATUS)) { + RTEXT_FILTER_CFM_STATUS | + RTEXT_FILTER_MST)) { af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) goto nla_put_failure; @@ -564,7 +568,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, nla_nest_end(skb, cfm_nest); } + if ((filter_mask & RTEXT_FILTER_MST) && + br_opt_get(br, BROPT_MST_ENABLED) && port) { + const struct net_bridge_vlan_group *vg = nbp_vlan_group(port); + struct nlattr *mst_nest; + int err; + + if (!vg || !vg->num_vlans) + goto done; + + mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST); + if (!mst_nest) + goto nla_put_failure; + + err = br_mst_fill_info(skb, vg); + if (err) + goto nla_put_failure; + + nla_nest_end(skb, mst_nest); + } + done: + if (af) nla_nest_end(skb, af); nlmsg_end(skb, nlh); @@ -803,6 +828,23 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MST: + if (!p) { + NL_SET_ERR_MSG(extack, + "MST states can only be set on bridge ports"); + return -EINVAL; + } + + if (cmd != RTM_SETLINK) { + NL_SET_ERR_MSG(extack, + "MST states can only be set through RTM_SETLINK"); + return -EINVAL; + } + + err = br_mst_process(p, attr, extack); + if (err) + return err; + break; } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3978e1d9ffb5..18ccc3d5d296 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1784,6 +1784,11 @@ int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti); void br_mst_vlan_init_state(struct net_bridge_vlan *v); int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); +size_t br_mst_info_size(const struct net_bridge_vlan_group *vg); +int br_mst_fill_info(struct sk_buff *skb, + const struct net_bridge_vlan_group *vg); +int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, + struct netlink_ext_ack *extack); #else static inline bool br_mst_is_enabled(struct net_bridge *br) { @@ -1801,6 +1806,24 @@ static inline int br_mst_set_enabled(struct net_bridge *br, bool on, { return -EOPNOTSUPP; } + +static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) +{ + return 0; +} + +static inline int br_mst_fill_info(struct sk_buff *skb, + const struct net_bridge_vlan_group *vg) +{ + return -EOPNOTSUPP; +} + +static inline int br_mst_process(struct net_bridge_port *p, + const struct nlattr *mst_attr, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif struct nf_br_ops { -- cgit v1.2.3 From 0dcac272540613d41c05e89679e4ddb978b612f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:09 +0100 Subject: bpf: Add multi kprobe link Adding new link type BPF_LINK_TYPE_KPROBE_MULTI that attaches kprobe program through fprobe 
API. The fprobe API makes it possible to attach a probe to multiple functions at once very quickly, because it works on top of ftrace. On the other hand, this limits the probe point to the function entry or return. The kprobe program gets the same pt_regs input ctx as when it's attached through the perf API. Adding new attach type BPF_TRACE_KPROBE_MULTI that allows attaching a kprobe program to multiple functions with the new link. The user provides an array of addresses or symbols, together with a count, to attach the kprobe program to. The new link_create uapi interface looks like:

struct {
	__u32 flags;
	__u32 cnt;
	__aligned_u64 syms;
	__aligned_u64 addrs;
} kprobe_multi;

The flags field allows a single bit, BPF_F_KPROBE_MULTI_RETURN, to be set; setting it creates a return (kretprobe-style) multi kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220316122419.933957-4-jolsa@kernel.org --- include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 7 ++ include/uapi/linux/bpf.h | 13 +++ kernel/bpf/syscall.c | 26 ++++- kernel/trace/bpf_trace.c | 211 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 13 +++ 6 files changed, 266 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 48a91c51c015..3e24ad0c4b3c 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -140,3 +140,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index dcea51fb60e2..8f0e9e7cb493 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -15,6 +15,7 @@ struct array_buffer; struct tracer; struct dentry; struct bpf_prog; +union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, @@ -738,6 +739,7 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -779,6 +781,11 @@ static inline int bpf_get_perf_event_info(const struct perf_event *event, { return -EOPNOTSUPP; } +static inline int +bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 99fab54ae9c0..d77f47af7752 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -997,6 +997,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1011,6 +1012,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, MAX_BPF_LINK_TYPE, }; @@ -1118,6 +1120,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
+ */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * @@ -1475,6 +1482,12 @@ union bpf_attr { */ __u64 bpf_cookie; } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + } kprobe_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9beb585be5a6..b8bb67ee6c57 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -32,6 +32,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -3022,6 +3023,11 @@ out_put_file: fput(perf_file); return err; } +#else +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PERF_EVENTS */ #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -4255,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.addrs static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; @@ -4279,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; @@ -4287,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) } ptype = prog->type; break; + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type != BPF_PERF_EVENT && + attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { @@ -4318,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; #endif -#ifdef CONFIG_PERF_EVENTS case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_KPROBE: ret = bpf_perf_link_attach(attr, prog); break; -#endif + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type == BPF_PERF_EVENT) + ret = bpf_perf_link_attach(attr, prog); + else + ret = bpf_kprobe_multi_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a2024ba32a20..fffa2171fae4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -2181,3 +2182,213 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FPROBE +struct bpf_kprobe_multi_link { + struct bpf_link link; + struct fprobe fp; + unsigned long *addrs; +}; + +static void bpf_kprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + unregister_fprobe(&kmulti_link->fp); +} + +static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + kvfree(kmulti_link->addrs); + kfree(kmulti_link); +} + +static 
const struct bpf_link_ops bpf_kprobe_multi_link_lops = { + .release = bpf_kprobe_multi_link_release, + .dealloc = bpf_kprobe_multi_link_dealloc, +}; + +static int +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, + struct pt_regs *regs) +{ + int err; + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + err = 0; + goto out; + } + + migrate_disable(); + rcu_read_lock(); + err = bpf_prog_run(link->link.prog, regs); + rcu_read_unlock(); + migrate_enable(); + + out: + __this_cpu_dec(bpf_prog_active); + return err; +} + +static void +kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, + struct pt_regs *regs) +{ + unsigned long saved_ip = instruction_pointer(regs); + struct bpf_kprobe_multi_link *link; + + /* + * Because fprobe's regs->ip is set to the next instruction of + * dynamic-ftrace instruction, correct entry ip must be set, so + * that the bpf program can access entry address via regs as same + * as kprobes. + * + * Both kprobe and kretprobe see the entry ip of traced function + * as instruction pointer. + */ + instruction_pointer_set(regs, entry_ip); + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, regs); + + instruction_pointer_set(regs, saved_ip); +} + +static int +kprobe_multi_resolve_syms(const void *usyms, u32 cnt, + unsigned long *addrs) +{ + unsigned long addr, size; + const char **syms; + int err = -ENOMEM; + unsigned int i; + char *func; + + size = cnt * sizeof(*syms); + syms = kvzalloc(size, GFP_KERNEL); + if (!syms) + return -ENOMEM; + + func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + goto error; + + if (copy_from_user(syms, usyms, size)) { + err = -EFAULT; + goto error; + } + + for (i = 0; i < cnt; i++) { + err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + err = -EINVAL; + addr = kallsyms_lookup_name(func); + if (!addr) + goto error; + if (!kallsyms_lookup_size_offset(addr, &size, NULL)) + goto error; + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) + goto error; + addrs[i] = addr; + } + + err = 0; +error: + kvfree(syms); + kfree(func); + return err; +} + +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_kprobe_multi_link *link = NULL; + struct bpf_link_primer link_primer; + unsigned long *addrs; + u32 flags, cnt, size; + void __user *uaddrs; + void __user *usyms; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.kprobe_multi.flags; + if (flags & ~BPF_F_KPROBE_MULTI_RETURN) + return -EINVAL; + + uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); + usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); + if (!!uaddrs == !!usyms) + return -EINVAL; + + cnt = attr->link_create.kprobe_multi.cnt; + if (!cnt) + return -EINVAL; + + size = cnt * sizeof(*addrs); + addrs = kvmalloc(size, GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + err = kprobe_multi_resolve_syms(usyms, cnt, addrs); + if (err) + goto error; + } + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, + &bpf_kprobe_multi_link_lops, prog); + + err = bpf_link_prime(&link->link, 
&link_primer); + if (err) + goto error; + + if (flags & BPF_F_KPROBE_MULTI_RETURN) + link->fp.exit_handler = kprobe_multi_link_handler; + else + link->fp.entry_handler = kprobe_multi_link_handler; + + link->addrs = addrs; + + err = register_fprobe_ips(&link->fp, addrs, cnt); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); + +error: + kfree(link); + kvfree(addrs); + return err; +} +#else /* !CONFIG_FPROBE */ +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +#endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 99fab54ae9c0..d77f47af7752 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -997,6 +997,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1011,6 +1012,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, MAX_BPF_LINK_TYPE, }; @@ -1118,6 +1120,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * @@ -1475,6 +1482,12 @@ union bpf_attr { */ __u64 bpf_cookie; } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + } kprobe_multi; }; } link_create; -- cgit v1.2.3 From ca74823c6e16dd42b7cf60d9fdde80e2a81a67bb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:12 +0100 Subject: bpf: Add cookie support to programs attached with kprobe multi link Adding support to call bpf_get_attach_cookie helper from kprobe programs attached with kprobe multi link. The cookie is provided by array of u64 values, where each value is paired with provided function address or symbol with the same array index. When cookie array is provided it's sorted together with addresses (check bpf_kprobe_multi_cookie_swap). This way we can find cookie based on the address in bpf_get_attach_cookie helper. 
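For illustration only (not part of the original series): a sketch of how a loader might attach one program to several symbols with per-symbol cookies, using the libbpf convenience wrapper added alongside this work (bpf_program__attach_kprobe_multi_opts). The symbol names and cookie values are made up, and error handling is elided.

#include <bpf/libbpf.h>

/* Pair each symbol with the same-index cookie; on the BPF side,
 * bpf_get_attach_cookie(ctx) then returns 4 or 6 depending on
 * which function fired.
 */
static const char *syms[] = { "tcp_v4_connect", "tcp_v6_connect" };
static const __u64 cookies[] = { 4, 6 };

static struct bpf_link *attach_with_cookies(struct bpf_program *prog)
{
	DECLARE_LIBBPF_OPTS(bpf_kprobe_multi_opts, opts,
		.syms = syms,
		.cookies = cookies,
		.cnt = 2,
	);

	return bpf_program__attach_kprobe_multi_opts(prog, NULL, &opts);
}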
Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220316122419.933957-7-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 114 ++++++++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d77f47af7752..7604e7d5438f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1487,6 +1487,7 @@ union bpf_attr { __u32 cnt; __aligned_u64 syms; __aligned_u64 addrs; + __aligned_u64 cookies; } kprobe_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b8bb67ee6c57..cdaa1152436a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4261,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.addrs +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0e7f8c9bc756..9a7b6be655e4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include @@ -78,6 +80,7 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip); /** * trace_call_bpf - invoke BPF program @@ -1050,6 +1053,18 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_cookie(current->bpf_ctx, instruction_pointer(regs)); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { + .func = bpf_get_attach_cookie_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; @@ -1297,7 +1312,9 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) &bpf_get_func_ip_proto_kprobe_multi : &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - return &bpf_get_attach_cookie_proto_trace; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_attach_cookie_proto_kmulti : + &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2203,6 +2220,13 @@ struct bpf_kprobe_multi_link { struct bpf_link link; struct fprobe fp; unsigned long *addrs; + /* + * The run_ctx here is used to get struct bpf_kprobe_multi_link in + * get_attach_cookie helper, so it can't be used to store data. 
+ */ + struct bpf_run_ctx run_ctx; + u64 *cookies; + u32 cnt; }; static void bpf_kprobe_multi_link_release(struct bpf_link *link) @@ -2219,6 +2243,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); kvfree(kmulti_link->addrs); + kvfree(kmulti_link->cookies); kfree(kmulti_link); } @@ -2227,10 +2252,60 @@ static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .dealloc = bpf_kprobe_multi_link_dealloc, }; +static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) +{ + const struct bpf_kprobe_multi_link *link = priv; + unsigned long *addr_a = a, *addr_b = b; + u64 *cookie_a, *cookie_b; + unsigned long tmp1; + u64 tmp2; + + cookie_a = link->cookies + (addr_a - link->addrs); + cookie_b = link->cookies + (addr_b - link->addrs); + + /* swap addr_a/addr_b and cookie_a/cookie_b values */ + tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; + tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; +} + +static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) +{ + const unsigned long *addr_a = a, *addr_b = b; + + if (*addr_a == *addr_b) + return 0; + return *addr_a < *addr_b ? -1 : 1; +} + +static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) +{ + return __bpf_kprobe_multi_cookie_cmp(a, b); +} + +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip) +{ + struct bpf_kprobe_multi_link *link; + unsigned long *addr; + u64 *cookie; + + if (WARN_ON_ONCE(!ctx)) + return 0; + link = container_of(ctx, struct bpf_kprobe_multi_link, run_ctx); + if (!link->cookies) + return 0; + addr = bsearch(&ip, link->addrs, link->cnt, sizeof(ip), + __bpf_kprobe_multi_cookie_cmp); + if (!addr) + return 0; + cookie = link->cookies + (addr - link->addrs); + return *cookie; +} + static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, struct pt_regs *regs) { + struct bpf_run_ctx *old_run_ctx; int err; if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { @@ -2240,7 +2315,9 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, migrate_disable(); rcu_read_lock(); + old_run_ctx = bpf_set_run_ctx(&link->run_ctx); err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); @@ -2326,9 +2403,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { struct bpf_kprobe_multi_link *link = NULL; struct bpf_link_primer link_primer; + void __user *ucookies; unsigned long *addrs; u32 flags, cnt, size; void __user *uaddrs; + u64 *cookies = NULL; void __user *usyms; int err; @@ -2368,6 +2447,19 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc(size, GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; @@ -2387,6 +2479,21 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->fp.entry_handler = kprobe_multi_link_handler; link->addrs = addrs; + link->cookies = cookies; + link->cnt = cnt; + + if (cookies) { + /* + * Sorting addresses will trigger sorting cookies as well + * (check bpf_kprobe_multi_cookie_swap). 
This way we can + * find cookie based on the address in bpf_get_attach_cookie + * helper. + */ + sort_r(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_cookie_cmp, + bpf_kprobe_multi_cookie_swap, + link); + } err = register_fprobe_ips(&link->fp, addrs, cnt); if (err) { @@ -2399,6 +2506,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr error: kfree(link); kvfree(addrs); + kvfree(cookies); return err; } #else /* !CONFIG_FPROBE */ @@ -2406,4 +2514,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { return -EOPNOTSUPP; } +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip) +{ + return 0; +} #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d77f47af7752..7604e7d5438f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1487,6 +1487,7 @@ union bpf_attr { __u32 cnt; __aligned_u64 syms; __aligned_u64 addrs; + __aligned_u64 cookies; } kprobe_multi; }; } link_create; -- cgit v1.2.3 From 54f586a9153201c6cff55e1f561990c78bd99aa7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Mar 2022 21:27:51 +0100 Subject: rfkill: make new event layout opt-in Again new complaints surfaced that we had broken the ABI here, although previously all the userspace tools had agreed that it was their mistake and fixed it. Yet now there are cases (e.g. RHEL) that want to run old userspace with newer kernels, and thus are broken. Since this is a bit of a whack-a-mole thing, change the whole extensibility scheme of rfkill to no longer just rely on the message lengths, but instead require userspace to opt in via a new ioctl to a given maximum event size that it is willing to understand. By default, set that to RFKILL_EVENT_SIZE_V1 (8), so that the behaviour for userspace not calling the ioctl will look as if it's just running on an older kernel. Fixes: 14486c82612a ("rfkill: add a reason to the HW rfkill state") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220316212749.16491491b270.Ifcb1950998330a596f29a2a162e00b7546a1d6d0@changeid --- include/uapi/linux/rfkill.h | 14 +++++++++++-- net/rfkill/core.c | 48 ++++++++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 9b77cfc42efa..283c5a7b3f2c 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -159,8 +159,16 @@ struct rfkill_event_ext { * old behaviour for all userspace, unless it explicitly opts in to the * rules outlined here by using the new &struct rfkill_event_ext. * - * Userspace using &struct rfkill_event_ext must adhere to the following - * rules + * Additionally, some other userspace (bluez, g-s-d) was reading with a + * large size but as streaming reads rather than message-based, or with + * too strict checks for the returned size. So eventually, we completely + * reverted this, and extended messages need to be opted in to by using + * an ioctl: + * + * ioctl(fd, RFKILL_IOCTL_MAX_SIZE, sizeof(struct rfkill_event_ext)); + * + * Userspace using &struct rfkill_event_ext and the ioctl must adhere to + * the following rules: * * 1. 
accept short writes, optionally using them to detect that it's * running on an older kernel; @@ -175,6 +183,8 @@ struct rfkill_event_ext { #define RFKILL_IOC_MAGIC 'R' #define RFKILL_IOC_NOINPUT 1 #define RFKILL_IOCTL_NOINPUT _IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT) +#define RFKILL_IOC_MAX_SIZE 2 +#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_EXT_SIZE, __u32) /* and that's all userspace gets */ diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 5b1927d66f0d..dac4fdc7488a 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -78,6 +78,7 @@ struct rfkill_data { struct mutex mtx; wait_queue_head_t read_wait; bool input_handler; + u8 max_size; }; @@ -1153,6 +1154,8 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) if (!data) return -ENOMEM; + data->max_size = RFKILL_EVENT_SIZE_V1; + INIT_LIST_HEAD(&data->events); mutex_init(&data->mtx); init_waitqueue_head(&data->read_wait); @@ -1235,6 +1238,7 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, list); sz = min_t(unsigned long, sizeof(ev->ev), count); + sz = min_t(unsigned long, sz, data->max_size); ret = sz; if (copy_to_user(buf, &ev->ev, sz)) ret = -EFAULT; @@ -1249,6 +1253,7 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { + struct rfkill_data *data = file->private_data; struct rfkill *rfkill; struct rfkill_event_ext ev; int ret; @@ -1263,6 +1268,7 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, * our API version even in a write() call, if it cares. */ count = min(count, sizeof(ev)); + count = min_t(size_t, count, data->max_size); if (copy_from_user(&ev, buf, count)) return -EFAULT; @@ -1322,31 +1328,47 @@ static int rfkill_fop_release(struct inode *inode, struct file *file) return 0; } -#ifdef CONFIG_RFKILL_INPUT static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct rfkill_data *data = file->private_data; + int ret = -ENOSYS; + u32 size; if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) return -ENOSYS; - if (_IOC_NR(cmd) != RFKILL_IOC_NOINPUT) - return -ENOSYS; - mutex_lock(&data->mtx); - - if (!data->input_handler) { - if (atomic_inc_return(&rfkill_input_disabled) == 1) - printk(KERN_DEBUG "rfkill: input handler disabled\n"); - data->input_handler = true; + switch (_IOC_NR(cmd)) { +#ifdef CONFIG_RFKILL_INPUT + case RFKILL_IOC_NOINPUT: + if (!data->input_handler) { + if (atomic_inc_return(&rfkill_input_disabled) == 1) + printk(KERN_DEBUG "rfkill: input handler disabled\n"); + data->input_handler = true; + } + ret = 0; + break; +#endif + case RFKILL_IOC_MAX_SIZE: + if (get_user(size, (__u32 __user *)arg)) { + ret = -EFAULT; + break; + } + if (size < RFKILL_EVENT_SIZE_V1 || size > U8_MAX) { + ret = -EINVAL; + break; + } + data->max_size = size; + ret = 0; + break; + default: + break; } - mutex_unlock(&data->mtx); - return 0; + return ret; } -#endif static const struct file_operations rfkill_fops = { .owner = THIS_MODULE, @@ -1355,10 +1377,8 @@ static const struct file_operations rfkill_fops = { .write = rfkill_fop_write, .poll = rfkill_fop_poll, .release = rfkill_fop_release, -#ifdef CONFIG_RFKILL_INPUT .unlocked_ioctl = rfkill_fop_ioctl, .compat_ioctl = compat_ptr_ioctl, -#endif .llseek = no_llseek, }; -- cgit v1.2.3 From 73799a889262b4675799bec20a2765be6d6a3f98 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 15 Mar 2022 15:38:54 -0400 Subject: counter: add new 
COUNTER_EVENT_CHANGE_OF_STATE Add new counter event to notify user space about every new counter pulse. Link: https://lore.kernel.org/r/20220203135727.2374052-2-o.rempel@pengutronix.de Signed-off-by: Oleksij Rempel Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/486a5de67414470449efb84d06a2f2214f4bb31d.1647373009.git.vilhelm.gray@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/counter.h b/include/uapi/linux/counter.h index d0aa95aeff7b..96c5ffd368ad 100644 --- a/include/uapi/linux/counter.h +++ b/include/uapi/linux/counter.h @@ -61,6 +61,8 @@ enum counter_event_type { COUNTER_EVENT_THRESHOLD, /* Index signal detected */ COUNTER_EVENT_INDEX, + /* State of counter is changed */ + COUNTER_EVENT_CHANGE_OF_STATE, }; /** -- cgit v1.2.3 From 336d4b814bf078fa698488632c19beca47308896 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 27 Jan 2022 12:15:32 -0600 Subject: ptrace: Move setting/clearing ptrace_message into ptrace_stop Today ptrace_message is easy to overlook as it is not a core part of ptrace_stop. It has been overlooked so much that there are places that set ptrace_message and don't clear it, and places that never set it. So if you get an unlucky sequence of events, the ptracer may be able to read a ptrace_message that does not apply to the current ptrace stop. Move setting of ptrace_message into ptrace_stop so that it always gets set before the stop, and always gets cleared after the stop. This prevents nonsense from being reported to userspace and makes ptrace_message more visible in the ptrace helper functions so that kernel developers can see it. Link: https://lkml.kernel.org/r/87bky67qfv.fsf_-_@email.froward.int.ebiederm.org Acked-by: Oleg Nesterov Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 9 +++------ include/uapi/linux/ptrace.h | 2 +- kernel/signal.c | 21 ++++++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 5310f43e4762..3e6b46e2b7be 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -60,7 +60,7 @@ extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned extern void ptrace_disable(struct task_struct *); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); -extern void ptrace_notify(int exit_code); +extern void ptrace_notify(int exit_code, unsigned long message); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred); @@ -155,8 +155,7 @@ static inline bool ptrace_event_enabled(struct task_struct *task, int event) static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { - current->ptrace_message = message; - ptrace_notify((event << 8) | SIGTRAP); + ptrace_notify((event << 8) | SIGTRAP, message); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) @@ -424,8 +423,7 @@ static inline int ptrace_report_syscall(unsigned long message) if (!(ptrace & PT_PTRACED)) return 0; - current->ptrace_message = message; - ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ?
0x80 : 0), message); /* * this isn't the same as continuing with a signal, but it will do @@ -437,7 +435,6 @@ static inline int ptrace_report_syscall(unsigned long message) current->exit_code = 0; } - current->ptrace_message = 0; return fatal_signal_pending(current); } diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index b7af92e07d1f..195ae64a8c87 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -114,7 +114,7 @@ struct ptrace_rseq_configuration { /* * These values are stored in task->ptrace_message - * by ptrace_report_syscall_* to describe the current syscall-stop. + * by ptrace_stop to describe the current syscall-stop. */ #define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 #define PTRACE_EVENTMSG_SYSCALL_EXIT 2 diff --git a/kernel/signal.c b/kernel/signal.c index c2dee5420567..a49ac7149256 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2191,7 +2191,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * If we actually decide not to stop at all because the tracer * is gone, we keep current->exit_code unless clear_code. */ -static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info) +static void ptrace_stop(int exit_code, int why, int clear_code, + unsigned long message, kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { @@ -2237,6 +2238,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t */ smp_wmb(); + current->ptrace_message = message; current->last_siginfo = info; current->exit_code = exit_code; @@ -2315,6 +2317,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t */ spin_lock_irq(¤t->sighand->siglock); current->last_siginfo = NULL; + current->ptrace_message = 0; /* LISTENING can be set only during STOP traps, clear it */ current->jobctl &= ~JOBCTL_LISTENING; @@ -2327,7 +2330,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t recalc_sigpending_tsk(current); } -static void ptrace_do_notify(int signr, int exit_code, int why) +static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long message) { kernel_siginfo_t info; @@ -2338,17 +2341,17 @@ static void ptrace_do_notify(int signr, int exit_code, int why) info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ - ptrace_stop(exit_code, why, 1, &info); + ptrace_stop(exit_code, why, 1, message, &info); } -void ptrace_notify(int exit_code) +void ptrace_notify(int exit_code, unsigned long message) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); if (unlikely(task_work_pending(current))) task_work_run(); spin_lock_irq(¤t->sighand->siglock); - ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); + ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message); spin_unlock_irq(¤t->sighand->siglock); } @@ -2504,10 +2507,10 @@ static void do_jobctl_trap(void) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), - CLD_STOPPED); + CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); - ptrace_stop(signr, CLD_STOPPED, 0, NULL); + ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL); current->exit_code = 0; } } @@ -2561,7 +2564,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; - ptrace_stop(signr, CLD_TRAPPED, 0, info); + ptrace_stop(signr, CLD_TRAPPED, 0, 0, info); /* We're back. Did the debugger cancel the sig? 
*/ signr = current->exit_code; @@ -2891,7 +2894,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping) if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); if (stepping) - ptrace_notify(SIGTRAP); + ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) -- cgit v1.2.3 From ee2a098851bfbe8bcdd964c0121f4246f00ff41e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 14 Mar 2022 11:20:41 -0700 Subject: bpf: Adjust BPF stack helper functions to accommodate skip > 0 Let's say that the caller has storage for num_elem stack frames. Then, the BPF stack helper functions walk the stack for only num_elem frames. This means that if skip > 0, one keeps only 'num_elem - skip' frames. This is because it sets init_nr in the perf_callchain_entry to the end of the buffer to save num_elem entries only. I believe it was because the perf callchain code unwound the stack frames until it reached the global max size (sysctl_perf_event_max_stack). However it now has perf_callchain_entry_ctx.max_stack to limit the iteration locally. This simplifies the code to handle init_nr in the BPF callstack entries and removes the confusion with the perf_event's __PERF_SAMPLE_CALLCHAIN_EARLY which sets init_nr to 0. Also change the comment on bpf_get_stack() in the header file to be more explicit what the return value means. Fixes: c195651e565a ("bpf: add bpf_get_stack helper") Signed-off-by: Namhyung Kim Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org Based-on-patch-by: Eugene Loh --- include/uapi/linux/bpf.h | 8 +++---- kernel/bpf/stackmap.c | 56 +++++++++++++++++++++--------------------------- 2 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7604e7d5438f..d14b10b85e51 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3009,8 +3009,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -4316,8 +4316,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. 
* * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 38bdfcd06f55..34725bfa1e97 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -176,7 +176,7 @@ build_id_valid: } static struct perf_callchain_entry * -get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; @@ -187,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) if (!entry) return NULL; - entry->nr = init_nr + - stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), - sysctl_perf_event_max_stack - init_nr, 0); + entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, + max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is @@ -201,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) int i; /* copy data from the end to avoid using extra buffer */ - for (i = entry->nr - 1; i >= (int)init_nr; i--) + for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } @@ -218,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; - /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth - */ - trace_nr = trace->nr - init_nr; - - if (trace_nr <= skip) + if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); - ips = trace->ip + skip + init_nr; + ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); @@ -295,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; @@ -305,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + max_depth += skip; + if (max_depth > sysctl_perf_event_max_stack) + max_depth = sysctl_perf_event_max_stack; + + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -397,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { - u32 init_nr, trace_nr, copy_len, 
elem_size, num_elem; + u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; @@ -422,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto err_fault; num_elem = size / elem_size; - if (sysctl_perf_event_max_stack < num_elem) - init_nr = 0; - else - init_nr = sysctl_perf_event_max_stack - num_elem; + max_depth = num_elem + skip; + if (sysctl_perf_event_max_stack < max_depth) + max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) - trace = get_callchain_entry_for_task(task, init_nr); + trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; - trace_nr = trace->nr - init_nr; - if (trace_nr < skip) + if (trace->nr < skip) goto err_fault; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; - ips = trace->ip + skip + init_nr; + + ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else -- cgit v1.2.3 From 6d8491910fcd3324d0f0ece3bd68e85ead3a04d7 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 1 Mar 2022 06:03:47 +0000 Subject: KVM: x86: Introduce KVM_CAP_DISABLE_QUIRKS2 KVM_CAP_DISABLE_QUIRKS is irrevocably broken. The capability does not advertise the set of quirks which may be disabled to userspace, so it is impossible to predict the behavior of KVM. Worse yet, KVM_CAP_DISABLE_QUIRKS will tolerate any value for cap->args[0], meaning it fails to reject attempts to set invalid quirk bits. The only valid workaround for the quirky quirks API is to add a new CAP. Actually advertise the set of quirks that can be disabled to userspace so it can predict KVM's behavior. Reject values for cap->args[0] that contain invalid bits. Finally, add documentation for the new capability and describe the existing quirks. Signed-off-by: Oliver Upton Message-Id: <20220301060351.442881-5-oupton@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 50 +++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 7 ++++++ arch/x86/kvm/x86.c | 8 +++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 66 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 55efa82e37eb..07a45474abe9 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7083,6 +7083,56 @@ resource that is controlled with the H_SET_MODE hypercall. This capability allows a guest kernel to use a better-performance mode for handling interrupts and system calls. +7.31 KVM_CAP_DISABLE_QUIRKS2 +---------------------------- + +:Capability: KVM_CAP_DISABLE_QUIRKS2 +:Parameters: args[0] - set of KVM quirks to disable +:Architectures: x86 +:Type: vm + +This capability, if enabled, will cause KVM to disable some behavior +quirks. + +Calling KVM_CHECK_EXTENSION for this capability returns a bitmask of +quirks that can be disabled in KVM. + +The argument to KVM_ENABLE_CAP for this capability is a bitmask of +quirks to disable, and must be a subset of the bitmask returned by +KVM_CHECK_EXTENSION. 
+
+The valid bits in cap.args[0] are:
+
+=================================== ============================================
+ KVM_X86_QUIRK_LINT0_REENABLED      By default, the reset value for the LVT
+                                    LINT0 register is 0x700 (APIC_MODE_EXTINT).
+                                    When this quirk is disabled, the reset value
+                                    is 0x10000 (APIC_LVT_MASKED).
+
+ KVM_X86_QUIRK_CD_NW_CLEARED        By default, KVM clears CR0.CD and CR0.NW.
+                                    When this quirk is disabled, KVM does not
+                                    change the value of CR0.CD and CR0.NW.
+
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE      By default, the MMIO LAPIC interface is
+                                    available even when configured for x2APIC
+                                    mode. When this quirk is disabled, KVM
+                                    disables the MMIO LAPIC interface if the
+                                    LAPIC is in x2APIC mode.
+
+ KVM_X86_QUIRK_OUT_7E_INC_RIP       By default, KVM pre-increments %rip before
+                                    exiting to userspace for an OUT instruction
+                                    to port 0x7e. When this quirk is disabled,
+                                    KVM does not pre-increment %rip before
+                                    exiting to userspace.
+
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT When this quirk is disabled, KVM sets
+                                    CPUID.01H:ECX[bit 3] (MONITOR/MWAIT) if
+                                    IA32_MISC_ENABLE[bit 18] (MWAIT) is set.
+                                    Additionally, when this quirk is disabled,
+                                    KVM clears CPUID.01H:ECX[bit 3] if
+                                    IA32_MISC_ENABLE[bit 18] is cleared.
+=================================== ============================================
+
 8. Other capabilities.
 ======================

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a2c855f04e3..0ddc2e67a731 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1966,4 +1966,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
 #define KVM_CLOCK_VALID_FLAGS \
	(KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)

+#define KVM_X86_VALID_QUIRKS \
+	(KVM_X86_QUIRK_LINT0_REENABLED |	\
+	 KVM_X86_QUIRK_CD_NW_CLEARED |		\
+	 KVM_X86_QUIRK_LAPIC_MMIO_HOLE |	\
+	 KVM_X86_QUIRK_OUT_7E_INC_RIP |		\
+	 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
+
 #endif /* _ASM_X86_KVM_HOST_H */

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c58265d9f1b2..fe2171b11441 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4363,6 +4363,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
		break;
	}
+	case KVM_CAP_DISABLE_QUIRKS2:
+		r = KVM_X86_VALID_QUIRKS;
+		break;
	default:
		break;
	}
@@ -5909,6 +5912,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
		return -EINVAL;

	switch (cap->cap) {
+	case KVM_CAP_DISABLE_QUIRKS2:
+		r = -EINVAL;
+		if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+			break;
+		fallthrough;
	case KVM_CAP_DISABLE_QUIRKS:
		kvm->arch.disabled_quirks = cap->args[0];
		r = 0;

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d2f1efc3aa35..91a6fe4e02c0 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1143,6 +1143,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_AIL_MODE_3 210
 #define KVM_CAP_S390_MEM_OP_EXTENSION 211
 #define KVM_CAP_PMU_CAPABILITY 212
+#define KVM_CAP_DISABLE_QUIRKS2 213

 #ifdef KVM_CAP_IRQ_ROUTING
--
cgit v1.2.3


From 824ddc601adc2cc48efb7f58b57997986c1c1276 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Tue, 22 Mar 2022 14:45:32 -0700
Subject: userfaultfd: provide unmasked address on page-fault

Userfaultfd is supposed to provide the full address (i.e., unmasked) of
the faulting access back to userspace. However, that has not been the
case for quite some time. Even running "userfaultfd_demo" from the
userfaultfd man page provides the wrong output (and contradicts the man
page).
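The fix below makes the exact address opt-in via a new feature bit, so
negotiating it from userspace is a one-liner at UFFDIO_API time. A
minimal sketch (error handling and the fault-handling loop abbreviated):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	static int open_uffd_exact(void)
	{
		struct uffdio_api api = {
			.api      = UFFD_API,
			.features = UFFD_FEATURE_EXACT_ADDRESS,
		};
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) == -1)
			return -1;
		/* msg.arg.pagefault.address is now delivered unmasked */
		return uffd;
	}

The man-page demo output below illustrates the current behavior.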
Notice that "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) and not the first read address (0x7fc5e30b300f). Address returned by mmap() = 0x7fc5e30b3000 fault_handler_thread(): poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 (uffdio_copy.copy returned 4096) Read address 0x7fc5e30b300f in main(): A Read address 0x7fc5e30b340f in main(): A Read address 0x7fc5e30b380f in main(): A Read address 0x7fc5e30b3c0f in main(): A The exact address is useful for various reasons and specifically for prefetching decisions. If it is known that the memory is populated by certain objects whose size is not page-aligned, then based on the faulting address, the uffd-monitor can decide whether to prefetch and prefault the adjacent page. This bug has been for quite some time in the kernel: since commit 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") vmf->virtual_address"), which dates back to 2016. A concern has been raised that existing userspace application might rely on the old/wrong behavior in which the address is masked. Therefore, it was suggested to provide the masked address unless the user explicitly asks for the exact address. Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct userfaultfd to provide the exact address. Add a new "real_address" field to vmf to hold the unmasked address. Provide the address to userspace accordingly. Initialize real_address in various code-paths to be consistent with address, even when it is not used, to be on the safe side. [namit@vmware.com: initialize real_address on all code paths, per Jan] Link: https://lkml.kernel.org/r/20220226022655.350562-1-namit@vmware.com [akpm@linux-foundation.org: fix typo in comment, per Jan] Link: https://lkml.kernel.org/r/20220218041003.3508-1-namit@vmware.com Signed-off-by: Nadav Amit Acked-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Jan Kara Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 5 ++++- include/linux/mm.h | 3 ++- include/uapi/linux/userfaultfd.h | 8 +++++++- mm/hugetlb.c | 6 ++++-- mm/memory.c | 1 + mm/swapfile.c | 1 + 6 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8e03b3d3f5fa..aa0c47cb0d16 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; + + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) + address &= PAGE_MASK; msg.arg.pagefault.address = address; /* * These flags indicate why the userfault occurred: @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d58321386cc..0e4fd101616e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -478,7 +478,8 @@ struct vm_fault { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + unsigned long address; 
/* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 05b31d60acf6..ef739054cb1c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -32,7 +32,8 @@ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ - UFFD_FEATURE_MINOR_SHMEM) + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -189,6 +190,10 @@ struct uffdio_api { * * UFFD_FEATURE_MINOR_SHMEM indicates the same support as * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -201,6 +206,7 @@ struct uffdio_api { #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f425147e4ce3..75b41879e9e9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5341,6 +5341,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, pgoff_t idx, unsigned int flags, unsigned long haddr, + unsigned long addr, unsigned long reason) { vm_fault_t ret; @@ -5348,6 +5349,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = haddr, + .real_address = addr, .flags = flags, /* @@ -5416,7 +5418,7 @@ retry: /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MISSING); goto out; } @@ -5480,7 +5482,7 @@ retry: unlock_page(page); put_page(page); ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MINOR); goto out; } diff --git a/mm/memory.c b/mm/memory.c index 1a55b4c5b5db..e0f3410fa70c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4633,6 +4633,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), diff --git a/mm/swapfile.c b/mm/swapfile.c index bf0df7aa7158..33c7abb16610 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct vm_fault vmf = { .vma = vma, .address = addr, + .real_address = addr, .pmd = pmd, }; -- cgit v1.2.3 From e99653afeb9585350644c5ae4b0ca987cbe8d053 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 9 Mar 2022 17:22:53 +0100 Subject: rtc: add new RTC_FEATURE_ALARM_WAKEUP_ONLY feature Some RTCs have an IRQ pin that is not connected to a CPU interrupt but rather directly to a PMIC or power supply. In that case, it is still useful to be able to set alarms but we shouldn't expect interrupts. 
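Userspace can discover the new flag through the existing RTC_PARAM_GET
ioctl (the expectation is that drivers set the bit in rtc->features at
probe time). A sketch, assuming the device is /dev/rtc0:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/rtc.h>

	static int alarm_is_wakeup_only(void)
	{
		struct rtc_param param = { .param = RTC_PARAM_FEATURES };
		int fd = open("/dev/rtc0", O_RDONLY);
		int ret = -1;

		if (fd >= 0 && !ioctl(fd, RTC_PARAM_GET, &param))
			/* alarm can wake the system, but no interrupt
			 * will be delivered to the CPU */
			ret = !!(param.uvalue &
				 (1ULL << RTC_FEATURE_ALARM_WAKEUP_ONLY));
		if (fd >= 0)
			close(fd);
		return ret;
	}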
Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220309162301.61679-22-alexandre.belloni@bootlin.com --- include/uapi/linux/rtc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtc.h b/include/uapi/linux/rtc.h index 03e5b776e597..97aca4503a6a 100644 --- a/include/uapi/linux/rtc.h +++ b/include/uapi/linux/rtc.h @@ -133,7 +133,8 @@ struct rtc_param { #define RTC_FEATURE_UPDATE_INTERRUPT 4 #define RTC_FEATURE_CORRECTION 5 #define RTC_FEATURE_BACKUP_SWITCH_MODE 6 -#define RTC_FEATURE_CNT 7 +#define RTC_FEATURE_ALARM_WAKEUP_ONLY 7 +#define RTC_FEATURE_CNT 8 /* parameter list */ #define RTC_PARAM_FEATURES 0 -- cgit v1.2.3 From c724c866bb70cb8c607081a26823a1f0ebde4387 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 23 Mar 2022 16:05:29 -0700 Subject: linux/types.h: remove unnecessary __bitwise__ There are no users of "__bitwise__" except the definition of "__bitwise". Remove __bitwise__ and define __bitwise directly. This is a follow-up to 05de97003c77 ("linux/types.h: enable endian checks for all sparse builds"). [akpm@linux-foundation.org: change the tools/include/linux/types.h definition also] Link: https://lkml.kernel.org/r/20220310220927.245704-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Cc: Michael S. Tsirkin Cc: Jonathan Corbet Cc: Nathan Chancellor Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/types.h | 5 ++--- tools/include/linux/types.h | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index f6d2f83cbe29..71696f424ac8 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -20,11 +20,10 @@ */ #ifdef __CHECKER__ -#define __bitwise__ __attribute__((bitwise)) +#define __bitwise __attribute__((bitwise)) #else -#define __bitwise__ +#define __bitwise #endif -#define __bitwise __bitwise__ typedef __u16 __bitwise __le16; typedef __u16 __bitwise __be16; diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h index 6e14a533ab4e..6c18c54e7d7f 100644 --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h @@ -43,11 +43,10 @@ typedef __u8 u8; typedef __s8 s8; #ifdef __CHECKER__ -#define __bitwise__ __attribute__((bitwise)) +#define __bitwise __attribute__((bitwise)) #else -#define __bitwise__ +#define __bitwise #endif -#define __bitwise __bitwise__ #define __force #define __user -- cgit v1.2.3 From 179fd6ba3bacbf7b19cbdf9b14be109d54318394 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 23 Mar 2022 16:05:32 -0700 Subject: Documentation/sparse: add hints about __CHECKER__ Several attributes depend on __CHECKER__, but previously there was no clue in the tree about when __CHECKER__ might be defined. Add hints at the most common places (__kernel, __user, __iomem, __bitwise) and in the sparse documentation. Link: https://lkml.kernel.org/r/20220310220927.245704-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Cc: Jonathan Corbet Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: "Michael S . 
Tsirkin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/sparse.rst | 2 ++ include/linux/compiler_types.h | 1 + include/uapi/linux/types.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst index 02102be7ff49..dc791c8d84d1 100644 --- a/Documentation/dev-tools/sparse.rst +++ b/Documentation/dev-tools/sparse.rst @@ -100,3 +100,5 @@ have already built it. The optional make variable CF can be used to pass arguments to sparse. The build system passes -Wbitwise to sparse automatically. + +Note that sparse defines the __CHECKER__ preprocessor symbol. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3c1795fdb568..4b3915ce38a4 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -4,6 +4,7 @@ #ifndef __ASSEMBLY__ +/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */ #ifdef __CHECKER__ /* address spaces */ # define __kernel __attribute__((address_space(0))) diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index 71696f424ac8..c4dc597f3dcf 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -19,6 +19,7 @@ * any application/library that wants linux/types.h. */ +/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */ #ifdef __CHECKER__ #define __bitwise __attribute__((bitwise)) #else -- cgit v1.2.3 From 7ef66d186eb95f987a97fb3329b65c840e2dc9bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Mar 2022 06:53:18 -0600 Subject: io_uring: remove IORING_CQE_F_MSG This was introduced with the message ring opcode, but isn't strictly required for the request itself. The sender can encode what is needed in user_data, which is passed to the receiver. It's unclear if having a separate flag that essentially says "This CQE did not originate from an SQE on this ring" provides any real utility to applications. While we can always re-introduce a flag to provide this information, we cannot take it away at a later point in time. Remove the flag while we still can, before it's in a released kernel. 
Signed-off-by: Jens Axboe
---
 fs/io_uring.c                 | 3 +--
 include/uapi/linux/io_uring.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 88556e654c5a..28b7a1b8abb6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4474,8 +4474,7 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
	target_ctx = req->file->private_data;

	spin_lock(&target_ctx->completion_lock);
-	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len,
-				 IORING_CQE_F_MSG);
+	filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
	io_commit_cqring(target_ctx);
	spin_unlock(&target_ctx->completion_lock);

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d2be4eb22008..784adc6f6ed2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -201,11 +201,9 @@ struct io_uring_cqe {
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
- * IORING_CQE_F_MSG	If set, CQE was generated with IORING_OP_MSG_RING
 */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
-#define IORING_CQE_F_MSG		(1U << 2)

 enum {
	IORING_CQE_BUFFER_SHIFT		= 16,
--
cgit v1.2.3


From 90a6951b58e935124eeb7ecd9fbc2426f841ac0c Mon Sep 17 00:00:00 2001
From: Gautam Dawar
Date: Tue, 15 Feb 2022 11:04:29 +0530
Subject: Add definition of VIRTIO_F_IN_ORDER feature bit

This patch adds the definition of the VIRTIO_F_IN_ORDER feature bit in
the relevant header file to make it available in QEMU's linux standard
header file virtio_config.h, which is updated using
scripts/update-linux-headers.sh.

Signed-off-by: Gautam Dawar
Link: https://lore.kernel.org/r/20220215053430.24650-1-gdawar@xilinx.com
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
---
 include/uapi/linux/virtio_config.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index b5eda06f0d57..f0fb0ae021c0 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -82,6 +82,12 @@
 /* This feature indicates support for the packed virtqueue layout. */
 #define VIRTIO_F_RING_PACKED		34

+/*
+ * Inorder feature indicates that all buffers are used by the device
+ * in the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER		35
+
 /*
 * This feature indicates that memory accesses by the driver and the
 * device are ordered in a way described by the platform.
--
cgit v1.2.3


From 13d640a3e9a3ac7ec694843d3d3b785e85fb8cb8 Mon Sep 17 00:00:00 2001
From: zhenwei pi
Date: Wed, 2 Mar 2022 11:39:14 +0800
Subject: virtio_crypto: Introduce VIRTIO_CRYPTO_NOSPC

Based on the latest virtio crypto spec, define VIRTIO_CRYPTO_NOSPC.

Reviewed-by: Gonglei
Signed-off-by: zhenwei pi
Link: https://lore.kernel.org/r/20220302033917.1295334-2-pizhenwei@bytedance.com
Signed-off-by: Michael S.
Tsirkin --- include/uapi/linux/virtio_crypto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_crypto.h b/include/uapi/linux/virtio_crypto.h index a03932f10565..1166a49084b0 100644 --- a/include/uapi/linux/virtio_crypto.h +++ b/include/uapi/linux/virtio_crypto.h @@ -408,6 +408,7 @@ struct virtio_crypto_op_data_req { #define VIRTIO_CRYPTO_BADMSG 2 #define VIRTIO_CRYPTO_NOTSUPP 3 #define VIRTIO_CRYPTO_INVSESS 4 /* Invalid session id */ +#define VIRTIO_CRYPTO_NOSPC 5 /* no free session ID */ /* The accelerator hardware is ready */ #define VIRTIO_CRYPTO_S_HW_READY (1 << 0) -- cgit v1.2.3 From 24e19590628b58578748eeaec8140bf9c9dc00d9 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 2 Mar 2022 11:39:15 +0800 Subject: virtio-crypto: introduce akcipher service Introduce asymmetric service definition, asymmetric operations and several well known algorithms. Co-developed-by: lei he Signed-off-by: lei he Signed-off-by: zhenwei pi Link: https://lore.kernel.org/r/20220302033917.1295334-3-pizhenwei@bytedance.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Gonglei --- include/uapi/linux/virtio_crypto.h | 81 +++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_crypto.h b/include/uapi/linux/virtio_crypto.h index 1166a49084b0..71a54a6849ca 100644 --- a/include/uapi/linux/virtio_crypto.h +++ b/include/uapi/linux/virtio_crypto.h @@ -37,6 +37,7 @@ #define VIRTIO_CRYPTO_SERVICE_HASH 1 #define VIRTIO_CRYPTO_SERVICE_MAC 2 #define VIRTIO_CRYPTO_SERVICE_AEAD 3 +#define VIRTIO_CRYPTO_SERVICE_AKCIPHER 4 #define VIRTIO_CRYPTO_OPCODE(service, op) (((service) << 8) | (op)) @@ -57,6 +58,10 @@ struct virtio_crypto_ctrl_header { VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x02) #define VIRTIO_CRYPTO_AEAD_DESTROY_SESSION \ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x03) +#define VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x04) +#define VIRTIO_CRYPTO_AKCIPHER_DESTROY_SESSION \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x05) __le32 opcode; __le32 algo; __le32 flag; @@ -180,6 +185,58 @@ struct virtio_crypto_aead_create_session_req { __u8 padding[32]; }; +struct virtio_crypto_rsa_session_para { +#define VIRTIO_CRYPTO_RSA_RAW_PADDING 0 +#define VIRTIO_CRYPTO_RSA_PKCS1_PADDING 1 + __le32 padding_algo; + +#define VIRTIO_CRYPTO_RSA_NO_HASH 0 +#define VIRTIO_CRYPTO_RSA_MD2 1 +#define VIRTIO_CRYPTO_RSA_MD3 2 +#define VIRTIO_CRYPTO_RSA_MD4 3 +#define VIRTIO_CRYPTO_RSA_MD5 4 +#define VIRTIO_CRYPTO_RSA_SHA1 5 +#define VIRTIO_CRYPTO_RSA_SHA256 6 +#define VIRTIO_CRYPTO_RSA_SHA384 7 +#define VIRTIO_CRYPTO_RSA_SHA512 8 +#define VIRTIO_CRYPTO_RSA_SHA224 9 + __le32 hash_algo; +}; + +struct virtio_crypto_ecdsa_session_para { +#define VIRTIO_CRYPTO_CURVE_UNKNOWN 0 +#define VIRTIO_CRYPTO_CURVE_NIST_P192 1 +#define VIRTIO_CRYPTO_CURVE_NIST_P224 2 +#define VIRTIO_CRYPTO_CURVE_NIST_P256 3 +#define VIRTIO_CRYPTO_CURVE_NIST_P384 4 +#define VIRTIO_CRYPTO_CURVE_NIST_P521 5 + __le32 curve_id; + __le32 padding; +}; + +struct virtio_crypto_akcipher_session_para { +#define VIRTIO_CRYPTO_NO_AKCIPHER 0 +#define VIRTIO_CRYPTO_AKCIPHER_RSA 1 +#define VIRTIO_CRYPTO_AKCIPHER_DSA 2 +#define VIRTIO_CRYPTO_AKCIPHER_ECDSA 3 + __le32 algo; + +#define VIRTIO_CRYPTO_AKCIPHER_KEY_TYPE_PUBLIC 1 +#define VIRTIO_CRYPTO_AKCIPHER_KEY_TYPE_PRIVATE 2 + __le32 keytype; + __le32 keylen; + + union { + struct 
virtio_crypto_rsa_session_para rsa; + struct virtio_crypto_ecdsa_session_para ecdsa; + } u; +}; + +struct virtio_crypto_akcipher_create_session_req { + struct virtio_crypto_akcipher_session_para para; + __u8 padding[36]; +}; + struct virtio_crypto_alg_chain_session_para { #define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER 1 #define VIRTIO_CRYPTO_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH 2 @@ -247,6 +304,8 @@ struct virtio_crypto_op_ctrl_req { mac_create_session; struct virtio_crypto_aead_create_session_req aead_create_session; + struct virtio_crypto_akcipher_create_session_req + akcipher_create_session; struct virtio_crypto_destroy_session_req destroy_session; __u8 padding[56]; @@ -266,6 +325,14 @@ struct virtio_crypto_op_header { VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x00) #define VIRTIO_CRYPTO_AEAD_DECRYPT \ VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AEAD, 0x01) +#define VIRTIO_CRYPTO_AKCIPHER_ENCRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x00) +#define VIRTIO_CRYPTO_AKCIPHER_DECRYPT \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x01) +#define VIRTIO_CRYPTO_AKCIPHER_SIGN \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x02) +#define VIRTIO_CRYPTO_AKCIPHER_VERIFY \ + VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x03) __le32 opcode; /* algo should be service-specific algorithms */ __le32 algo; @@ -390,6 +457,16 @@ struct virtio_crypto_aead_data_req { __u8 padding[32]; }; +struct virtio_crypto_akcipher_para { + __le32 src_data_len; + __le32 dst_data_len; +}; + +struct virtio_crypto_akcipher_data_req { + struct virtio_crypto_akcipher_para para; + __u8 padding[40]; +}; + /* The request of the data virtqueue's packet */ struct virtio_crypto_op_data_req { struct virtio_crypto_op_header header; @@ -399,6 +476,7 @@ struct virtio_crypto_op_data_req { struct virtio_crypto_hash_data_req hash_req; struct virtio_crypto_mac_data_req mac_req; struct virtio_crypto_aead_data_req aead_req; + struct virtio_crypto_akcipher_data_req akcipher_req; __u8 padding[48]; } u; }; @@ -409,6 +487,7 @@ struct virtio_crypto_op_data_req { #define VIRTIO_CRYPTO_NOTSUPP 3 #define VIRTIO_CRYPTO_INVSESS 4 /* Invalid session id */ #define VIRTIO_CRYPTO_NOSPC 5 /* no free session ID */ +#define VIRTIO_CRYPTO_KEY_REJECTED 6 /* Signature verification failed */ /* The accelerator hardware is ready */ #define VIRTIO_CRYPTO_S_HW_READY (1 << 0) @@ -439,7 +518,7 @@ struct virtio_crypto_config { __le32 max_cipher_key_len; /* Maximum length of authenticated key */ __le32 max_auth_key_len; - __le32 reserve; + __le32 akcipher_algo; /* Maximum size of each crypto request's content */ __le64 max_size; }; -- cgit v1.2.3 From a61280ddddaa45f95b60dd54c05f8e0e5b6810b7 Mon Sep 17 00:00:00 2001 From: Longpeng Date: Tue, 15 Mar 2022 11:25:51 +0800 Subject: vdpa: support exposing the config size to userspace - GET_CONFIG_SIZE: return the size of the virtio config space. The size contains the fields which are conditional on feature bits. Acked-by: Jason Wang Signed-off-by: Longpeng Link: https://lore.kernel.org/r/20220315032553.455-2-longpeng2@huawei.com Signed-off-by: Michael S. 
Tsirkin
Reviewed-by: Stefano Garzarella
---
 drivers/vhost/vdpa.c       | 17 +++++++++++++++++
 include/linux/vdpa.h       |  3 ++-
 include/uapi/linux/vhost.h |  4 ++++
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 05f5fd2af58f..04375722c0e5 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -358,6 +358,20 @@ static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp)
	return 0;
 }

+static long vhost_vdpa_get_config_size(struct vhost_vdpa *v, u32 __user *argp)
+{
+	struct vdpa_device *vdpa = v->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+	u32 size;
+
+	size = ops->get_config_size(vdpa);
+
+	if (copy_to_user(argp, &size, sizeof(size)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
				   void __user *argp)
 {
@@ -495,6 +509,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
	case VHOST_VDPA_GET_IOVA_RANGE:
		r = vhost_vdpa_get_iova_range(v, argp);
		break;
+	case VHOST_VDPA_GET_CONFIG_SIZE:
+		r = vhost_vdpa_get_config_size(v, argp);
+		break;
	default:
		r = vhost_dev_ioctl(&v->vdev, cmd, argp);
		if (r == -ENOIOCTLCMD)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 721089bb4c84..a5269191edda 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -207,7 +207,8 @@ struct vdpa_map_file {
 * @reset:			Reset device
 *				@vdev: vdpa device
 *				Returns integer: success (0) or error (< 0)
- * @get_config_size:		Get the size of the configuration space
+ * @get_config_size:		Get the size of the configuration space, including
+ *				fields that are conditional on feature bits.
 *				@vdev: vdpa device
 *				Returns size_t: configuration size
 * @get_config:			Read from device specific configuration space

diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c998860d7bbc..bc74e95a273a 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -150,4 +150,8 @@
 /* Get the valid iova range */
 #define VHOST_VDPA_GET_IOVA_RANGE	_IOR(VHOST_VIRTIO, 0x78, \
					     struct vhost_vdpa_iova_range)
+
+/* Get the config size */
+#define VHOST_VDPA_GET_CONFIG_SIZE	_IOR(VHOST_VIRTIO, 0x79, __u32)
+
 #endif
--
cgit v1.2.3


From b04d910af330b55e1d5d6eb9ecd53a375a9cf81c Mon Sep 17 00:00:00 2001
From: Longpeng
Date: Tue, 15 Mar 2022 11:25:53 +0800
Subject: vdpa: support exposing the count of vqs to userspace

- GET_VQS_COUNT: the count of virtqueues that are exposed

Signed-off-by: Longpeng
Link: https://lore.kernel.org/r/20220315032553.455-4-longpeng2@huawei.com
Signed-off-by: Michael S. Tsirkin
Acked-by: Jason Wang
Reviewed-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 13 +++++++++++++ include/uapi/linux/vhost.h | 3 +++ 2 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 3b7fc2fe45f3..4c2f0bd06285 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -373,6 +373,16 @@ static long vhost_vdpa_get_config_size(struct vhost_vdpa *v, u32 __user *argp) return 0; } +static long vhost_vdpa_get_vqs_count(struct vhost_vdpa *v, u32 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + + if (copy_to_user(argp, &vdpa->nvqs, sizeof(vdpa->nvqs))) + return -EFAULT; + + return 0; +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -513,6 +523,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_VDPA_GET_CONFIG_SIZE: r = vhost_vdpa_get_config_size(v, argp); break; + case VHOST_VDPA_GET_VQS_COUNT: + r = vhost_vdpa_get_vqs_count(v, argp); + break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bc74e95a273a..5d99e7c242a2 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -154,4 +154,7 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + #endif -- cgit v1.2.3 From f941c51eeac7ebe0f8ec30943bf78e7f60aad039 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 29 Mar 2022 20:18:15 +0000 Subject: loop: fix ioctl calls using compat_loop_info Support for cryptoloop was deleted in commit 47e9624616c8 ("block: remove support for cryptoloop and the xor transfer"), making the usage of loop_info->lo_encrypt_type obsolete. However, this member was also removed from the compat_loop_info definition and this breaks userspace ioctl calls for 32-bit binaries and CONFIG_COMPAT=y. This patch restores the compat_loop_info->lo_encrypt_type member and marks it obsolete as well as in the uapi header definitions. 
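The breakage was purely layout: with the member gone, every following
field of the structure shifts by four bytes and LOOP_GET_STATUS copies
garbage for 32-bit callers. A build-time sketch of the invariant this
change restores, written against the uapi struct:

	#include <stddef.h>
	#include <linux/loop.h>

	/* lo_encrypt_type is dead weight, but it must keep its slot so
	 * that the fields after it stay where existing binaries expect
	 * them to be. */
	_Static_assert(offsetof(struct loop_info, lo_encrypt_key_size) ==
		       offsetof(struct loop_info, lo_encrypt_type) +
		       sizeof(int),
		       "loop_info ABI layout");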
Fixes: 47e9624616c8 ("block: remove support for cryptoloop and the xor transfer") Signed-off-by: Carlos Llamas Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220329201815.1347500-1-cmllamas@google.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 + include/uapi/linux/loop.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e733c48de2e9..39e7650f89b1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1592,6 +1592,7 @@ struct compat_loop_info { compat_ulong_t lo_inode; /* ioctl r/o */ compat_dev_t lo_rdevice; /* ioctl r/o */ compat_int_t lo_offset; + compat_int_t lo_encrypt_type; /* obsolete, ignored */ compat_int_t lo_encrypt_key_size; /* ioctl w/o */ compat_int_t lo_flags; /* ioctl r/o */ char lo_name[LO_NAME_SIZE]; diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index 24a1c45bd1ae..98e60801195e 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -45,7 +45,7 @@ struct loop_info { unsigned long lo_inode; /* ioctl r/o */ __kernel_old_dev_t lo_rdevice; /* ioctl r/o */ int lo_offset; - int lo_encrypt_type; + int lo_encrypt_type; /* obsolete, ignored */ int lo_encrypt_key_size; /* ioctl w/o */ int lo_flags; char lo_name[LO_NAME_SIZE]; @@ -61,7 +61,7 @@ struct loop_info64 { __u64 lo_offset; __u64 lo_sizelimit;/* bytes, 0 == max available */ __u32 lo_number; /* ioctl r/o */ - __u32 lo_encrypt_type; + __u32 lo_encrypt_type; /* obsolete, ignored */ __u32 lo_encrypt_key_size; /* ioctl w/o */ __u32 lo_flags; __u8 lo_file_name[LO_NAME_SIZE]; -- cgit v1.2.3 From 55037ed7bdc62151a726f5685f88afa6a82959b1 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Tue, 29 Mar 2022 10:12:52 -0700 Subject: uapi/linux/stddef.h: Add include guards Add include guard wrapper define to uapi/linux/stddef.h to prevent macro redefinition errors when stddef.h is included more than once. This was not needed before since the only contents already used a redefinition test. Signed-off-by: Tadeusz Struk Link: https://lore.kernel.org/r/20220329171252.57279-1-tadeusz.struk@linaro.org Fixes: 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 3021ea25a284..7837ba4fe728 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_STDDEF_H +#define _UAPI_LINUX_STDDEF_H + #include #ifndef __always_inline @@ -41,3 +44,4 @@ struct { } __empty_ ## NAME; \ TYPE NAME[]; \ } +#endif -- cgit v1.2.3 From 35025735a79eaa894c43837b94fd33c9d6b122df Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:18 +0000 Subject: KVM: x86/xen: Support direct injection of event channel events This adds a KVM_XEN_HVM_EVTCHN_SEND ioctl which allows direct injection of events given an explicit { vcpu, port, priority } in precisely the same form that those fields are given in the IRQ routing table. Userspace is currently able to inject 2-level events purely by setting the bits in the shared_info and vcpu_info, but FIFO event channels are harder to deal with; we will need the kernel to take sole ownership of delivery when we support those. 
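From the VMM side this is a plain VM ioctl; a sketch (the port and vCPU
numbers are made-up examples, vm_fd is the VM file descriptor):

	#include <err.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static void inject_evtchn(int vm_fd)
	{
		struct kvm_irq_routing_xen_evtchn uxe = {
			.port     = 3,	/* example: a 2-level port # */
			.vcpu     = 0,
			.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
		};

		if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &uxe) < 0)
			err(1, "KVM_XEN_HVM_EVTCHN_SEND");
	}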
A patch advertising this feature with a new bit in the KVM_CAP_XEN_HVM ioctl will be added in a subsequent patch. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-9-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++++++ arch/x86/kvm/xen.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kvm/xen.h | 1 + include/uapi/linux/kvm.h | 3 +++ 4 files changed, 45 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index feefa1fb32eb..e6bc0b654bbb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6505,6 +6505,15 @@ set_pit2_out: r = kvm_xen_hvm_set_attr(kvm, &xha); break; } + case KVM_XEN_HVM_EVTCHN_SEND: { + struct kvm_irq_routing_xen_evtchn uxe; + + r = -EFAULT; + if (copy_from_user(&uxe, argp, sizeof(uxe))) + goto out; + r = kvm_xen_hvm_evtchn_send(kvm, &uxe); + break; + } #endif case KVM_SET_CLOCK: r = kvm_vm_ioctl_set_clock(kvm, argp); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9c87263a5be2..f0f0011c4617 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1086,6 +1086,38 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, return 0; } +/* + * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl. + */ +int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe) +{ + struct kvm_xen_evtchn e; + int ret; + + if (!uxe->port || uxe->port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e.port = uxe->port; + e.vcpu_id = uxe->vcpu; + e.vcpu_idx = -1; + e.priority = uxe->priority; + + ret = kvm_xen_set_evtchn(&e, kvm); + + /* + * None of that 'return 1 if it actually got delivered' nonsense. + * We don't care if it was masked (-ENOTCONN) either. 
+	 */
+	if (ret > 0 || ret == -ENOTCONN)
+		ret = 0;
+
+	return ret;
+}
+
 void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 {
	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,

diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index e28feb32add6..852286de574e 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -20,6 +20,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
 int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
 int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 91a6fe4e02c0..49cd2e9e0f6a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1699,6 +1699,9 @@ struct kvm_xen_hvm_attr {
 #define KVM_XEN_VCPU_GET_ATTR	_IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
 #define KVM_XEN_VCPU_SET_ATTR	_IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr)

+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_HVM_EVTCHN_SEND	_IOW(KVMIO, 0xd0, struct kvm_irq_routing_xen_evtchn)
+
 #define KVM_GET_SREGS2	_IOR(KVMIO, 0xcc, struct kvm_sregs2)
 #define KVM_SET_SREGS2	_IOW(KVMIO, 0xcd, struct kvm_sregs2)
--
cgit v1.2.3


From 2fd6df2f2b47d4301b1ee0fe9d627d1c061a5988 Mon Sep 17 00:00:00 2001
From: Joao Martins
Date: Thu, 3 Mar 2022 15:41:19 +0000
Subject: KVM: x86/xen: intercept EVTCHNOP_send from guests

Userspace registers a sending @port to either deliver to an @eventfd
or directly back to a local event channel port. After binding events,
the guest or host may wish to bind those events to a particular vcpu.
This is usually done for unbound and interdomain events.

Update requests are handled via the KVM_XEN_EVTCHN_UPDATE flag.
Unregistered ports are handled by the emulator.
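For example, registering guest port 5 for eventfd delivery could look
like this from the VMM (a sketch; efd is an eventfd the VMM created,
and EVTCHNSTAT_interdomain is defined locally because it normally comes
from Xen's interface headers):

	#include <err.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	#define EVTCHNSTAT_interdomain	2  /* xen/interface/event_channel.h */

	static void register_evtchn(int vm_fd, int efd)
	{
		struct kvm_xen_hvm_attr ha = {
			.type = KVM_XEN_ATTR_TYPE_EVTCHN,
			.u.evtchn = {
				.send_port = 5,
				.type	   = EVTCHNSTAT_interdomain,
				/* .deliver.eventfd.port stays 0, which
				 * selects eventfd delivery */
				.deliver.eventfd.fd = efd,
			},
		};

		if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha) < 0)
			err(1, "KVM_XEN_ATTR_TYPE_EVTCHN");
	}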
Co-developed-by: Ankur Arora Co-developed-By: David Woodhouse Signed-off-by: Joao Martins Signed-off-by: Ankur Arora Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-10-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 295 ++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/kvm.h | 28 ++++ 3 files changed, 309 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7808491ebba8..b20f7d99d702 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1024,6 +1024,7 @@ struct kvm_xen { bool long_mode; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; + struct idr evtchn_ports; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f0f0011c4617..3d95167028ba 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -11,6 +11,7 @@ #include "lapic.h" #include "hyperv.h" +#include #include #include @@ -21,6 +22,9 @@ #include "trace.h" +static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data); +static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); + DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) @@ -365,36 +369,44 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { + mutex_lock(&kvm->lock); kvm->arch.xen.long_mode = !!data->u.long_mode; + mutex_unlock(&kvm->lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: + mutex_lock(&kvm->lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + mutex_unlock(&kvm->lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { + mutex_lock(&kvm->lock); kvm->arch.xen.upcall_vector = data->u.vector; + mutex_unlock(&kvm->lock); r = 0; } break; + case KVM_XEN_ATTR_TYPE_EVTCHN: + r = kvm_xen_setattr_evtchn(kvm, data); + break; + default: break; } - mutex_unlock(&kvm->lock); return r; } @@ -770,18 +782,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } -void kvm_xen_init_vm(struct kvm *kvm) -{ -} - -void kvm_xen_destroy_vm(struct kvm *kvm) -{ - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); - - if (kvm->arch.xen_hvm_config.msr) - static_branch_slow_dec_deferred(&kvm_xen_enabled); -} - static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { kvm_rax_write(vcpu, result); @@ -801,7 +801,8 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) int kvm_xen_hypercall(struct kvm_vcpu *vcpu) { bool longmode; - u64 input, params[6]; + u64 input, params[6], r = -ENOSYS; + bool handled = false; input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -832,6 +833,19 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) trace_kvm_xen_hypercall(input, params[0], params[1], params[2], params[3], params[4], params[5]); + switch (input) { + case __HYPERVISOR_event_channel_op: + if (params[0] == EVTCHNOP_send) + handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r); + break; + + default: + break; + } + + if (handled) + return kvm_xen_hypercall_set_result(vcpu, r); + vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = 
KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; @@ -1118,6 +1132,234 @@ int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn * return ret; } +/* + * Support for *outbound* event channel events via the EVTCHNOP_send hypercall. + */ +struct evtchnfd { + u32 send_port; + u32 type; + union { + struct kvm_xen_evtchn port; + struct { + u32 port; /* zero */ + struct eventfd_ctx *ctx; + } eventfd; + } deliver; +}; + +/* + * Update target vCPU or priority for a registered sending channel. + */ +static int kvm_xen_eventfd_update(struct kvm *kvm, + struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + struct evtchnfd *evtchnfd; + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + mutex_lock(&kvm->lock); + evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); + mutex_unlock(&kvm->lock); + + if (!evtchnfd) + return -ENOENT; + + /* For an UPDATE, nothing may change except the priority/vcpu */ + if (evtchnfd->type != data->u.evtchn.type) + return -EINVAL; + + /* + * Port cannot change, and if it's zero that was an eventfd + * which can't be changed either. + */ + if (!evtchnfd->deliver.port.port || + evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + mutex_lock(&kvm->lock); + evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; + if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { + evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; + evtchnfd->deliver.port.vcpu_idx = -1; + } + mutex_unlock(&kvm->lock); + return 0; +} + +/* + * Configure the target (eventfd or local port delivery) for sending on + * a given event channel. 
+ */ +static int kvm_xen_eventfd_assign(struct kvm *kvm, + struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + struct eventfd_ctx *eventfd = NULL; + struct evtchnfd *evtchnfd = NULL; + int ret = -EINVAL; + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); + if (!evtchnfd) + return -ENOMEM; + + switch(data->u.evtchn.type) { + case EVTCHNSTAT_ipi: + /* IPI must map back to the same port# */ + if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) + goto out; /* -EINVAL */ + break; + + case EVTCHNSTAT_interdomain: + if (data->u.evtchn.deliver.port.port) { + if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) + goto out; /* -EINVAL */ + } else { + eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto out; + } + } + break; + + case EVTCHNSTAT_virq: + case EVTCHNSTAT_closed: + case EVTCHNSTAT_unbound: + case EVTCHNSTAT_pirq: + default: /* Unknown event channel type */ + goto out; /* -EINVAL */ + } + + evtchnfd->send_port = data->u.evtchn.send_port; + evtchnfd->type = data->u.evtchn.type; + if (eventfd) { + evtchnfd->deliver.eventfd.ctx = eventfd; + } else { + /* We only support 2 level event channels for now */ + if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + goto out; /* -EINVAL; */ + + evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port; + evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; + evtchnfd->deliver.port.vcpu_idx = -1; + evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; + } + + mutex_lock(&kvm->lock); + ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, + GFP_KERNEL); + mutex_unlock(&kvm->lock); + if (ret >= 0) + return 0; + + if (ret == -ENOSPC) + ret = -EEXIST; +out: + if (eventfd) + eventfd_ctx_put(eventfd); + kfree(evtchnfd); + return ret; +} + +static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) +{ + struct evtchnfd *evtchnfd; + + mutex_lock(&kvm->lock); + evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); + mutex_unlock(&kvm->lock); + + if (!evtchnfd) + return -ENOENT; + + if (kvm) + synchronize_srcu(&kvm->srcu); + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + return 0; +} + +static int kvm_xen_eventfd_reset(struct kvm *kvm) +{ + struct evtchnfd *evtchnfd; + int i; + + mutex_lock(&kvm->lock); + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); + synchronize_srcu(&kvm->srcu); + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + } + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data) +{ + u32 port = data->u.evtchn.send_port; + + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET) + return kvm_xen_eventfd_reset(kvm); + + if (!port || port >= max_evtchn_port(kvm)) + return -EINVAL; + + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN) + return kvm_xen_eventfd_deassign(kvm, port); + if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE) + return kvm_xen_eventfd_update(kvm, data); + if (data->u.evtchn.flags) + return -EINVAL; + + return kvm_xen_eventfd_assign(kvm, data); +} + +static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) +{ + struct evtchnfd *evtchnfd; + struct evtchn_send send; + 
gpa_t gpa; + int idx; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) { + *r = -EFAULT; + return true; + } + + /* The evtchn_ports idr is protected by vcpu->kvm->srcu */ + evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); + if (!evtchnfd) + return false; + + if (evtchnfd->deliver.port.port) { + int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm); + if (ret < 0 && ret != -ENOTCONN) + return false; + } else { + eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + } + + *r = 0; + return true; +} + void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) { kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, @@ -1127,3 +1369,26 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); } + +void kvm_xen_init_vm(struct kvm *kvm) +{ + idr_init(&kvm->arch.xen.evtchn_ports); +} + +void kvm_xen_destroy_vm(struct kvm *kvm) +{ + struct evtchnfd *evtchnfd; + int i; + + kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + if (!evtchnfd->deliver.port.port) + eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); + kfree(evtchnfd); + } + idr_destroy(&kvm->arch.xen.evtchn_ports); + + if (kvm->arch.xen_hvm_config.msr) + static_branch_slow_dec_deferred(&kvm_xen_enabled); +} diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 49cd2e9e0f6a..623ed2cb228f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1686,6 +1686,32 @@ struct kvm_xen_hvm_attr { struct { __u64 gfn; } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. + */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u64 pad[8]; } u; }; @@ -1694,6 +1720,8 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 #define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) -- cgit v1.2.3 From 942c2490c23f2800ad8143f5eb84a79b859aa743 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:21 +0000 Subject: KVM: x86/xen: Add KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID In order to intercept hypercalls such as VCPUOP_set_singleshot_timer, we need to be aware of the Xen CPU numbering. This looks a lot like the Hyper-V handling of vpidx, for obvious reasons. 
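The default (initialized below in kvm_xen_init_vcpu()) remains the
vcpu_idx, so existing VMMs keep working; one that exposes a different
Xen/ACPI vCPU numbering can override it per vCPU, e.g. (a sketch,
where vcpu_fd is the vCPU file descriptor):

	#include <err.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static void set_xen_vcpu_id(int vcpu_fd, __u32 xen_vcpu_id)
	{
		struct kvm_xen_vcpu_attr va = {
			.type	   = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
			.u.vcpu_id = xen_vcpu_id,
		};

		if (ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va) < 0)
			err(1, "KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID");
	}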
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-12-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 1 + arch/x86/kvm/xen.c | 19 +++++++++++++++++++ arch/x86/kvm/xen.h | 5 +++++ include/uapi/linux/kvm.h | 3 +++ 5 files changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b20f7d99d702..9e3408542b41 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -613,6 +613,7 @@ struct kvm_vcpu_xen { u64 runstate_entry_time; u64 runstate_times[4]; unsigned long evtchn_pending_sel; + u32 vcpu_id; /* The Xen / ACPI vCPU ID */ }; struct kvm_vcpu_arch { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6bc0b654bbb..3d7b65f33bd8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11262,6 +11262,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + kvm_xen_init_vcpu(vcpu); kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); kvm_set_tsc_khz(vcpu, max_tsc_khz); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 0dae583c4a1d..244cf3cd858a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -603,6 +603,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: + if (data->u.vcpu_id >= KVM_MAX_VCPUS) + r = -EINVAL; + else { + vcpu->arch.xen.vcpu_id = data->u.vcpu_id; + r = 0; + } + break; + default: break; } @@ -678,6 +687,11 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = -EINVAL; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: + data->u.vcpu_id = vcpu->arch.xen.vcpu_id; + r = 0; + break; + default: break; } @@ -1377,6 +1391,11 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) return true; } +void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; +} + void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) { kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 852286de574e..54d587aae85b 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -25,6 +25,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); +void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu); void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu); int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm); @@ -75,6 +76,10 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } +static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) +{ +} + static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) { } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 623ed2cb228f..4b65e9f0a4d9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1747,6 +1747,7 @@ struct kvm_xen_vcpu_attr { __u64 time_blocked; __u64 time_offline; } runstate; + __u32 vcpu_id; } u; }; @@ -1757,6 +1758,8 @@ struct kvm_xen_vcpu_attr { #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ 
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { -- cgit v1.2.3 From 536395260582be7443b0b35b0bbb89ffe3947f62 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 3 Mar 2022 15:41:22 +0000 Subject: KVM: x86/xen: handle PV timers oneshot mode If the guest has offloaded the timer virq, handle the following hypercalls for programming the timer: VCPUOP_set_singleshot_timer VCPUOP_stop_singleshot_timer set_timer_op(timestamp_ns) The event channel corresponding to the timer virq is then used to inject events once timer deadlines are met. For now we back the PV timer with hrtimer. [ dwmw2: Add save/restore, 32-bit compat mode, immediate delivery, don't check timer in kvm_vcpu_has_event() ] Signed-off-by: Joao Martins Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-13-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 + arch/x86/kvm/irq.c | 10 +- arch/x86/kvm/xen.c | 211 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/xen.h | 28 ++++++ include/uapi/linux/kvm.h | 6 ++ 5 files changed, 257 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e3408542b41..8193d5285544 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -614,6 +614,10 @@ struct kvm_vcpu_xen { u64 runstate_times[4]; unsigned long evtchn_pending_sel; u32 vcpu_id; /* The Xen / ACPI vCPU ID */ + u32 timer_virq; + u64 timer_expires; /* In guest epoch */ + atomic_t timer_pending; + struct hrtimer timer; }; struct kvm_vcpu_arch { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 172b05343cfd..f371f1292ca3 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -22,10 +22,14 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { + int r = 0; + if (lapic_in_kernel(vcpu)) - return apic_has_pending_timer(vcpu); + r = apic_has_pending_timer(vcpu); + if (kvm_xen_timer_enabled(vcpu)) + r += kvm_xen_has_pending_timer(vcpu); - return 0; + return r; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) kvm_inject_apic_timer_irqs(vcpu); + if (kvm_xen_timer_enabled(vcpu)) + kvm_xen_inject_timer_irqs(vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 244cf3cd858a..0e1595dc8c5f 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -23,6 +23,7 @@ #include "trace.h" +static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data); static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); @@ -108,6 +109,66 @@ out: return ret; } +void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) +{ + if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) { + struct kvm_xen_evtchn e; + + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + kvm_xen_set_evtchn(&e, vcpu->kvm); + + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); + } +} + +static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, + arch.xen.timer); + if (atomic_read(&vcpu->arch.xen.timer_pending)) + return HRTIMER_NORESTART; + 
+ atomic_inc(&vcpu->arch.xen.timer_pending); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); + + return HRTIMER_NORESTART; +} + +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +{ + atomic_set(&vcpu->arch.xen.timer_pending, 0); + vcpu->arch.xen.timer_expires = guest_abs; + + if (delta_ns <= 0) { + xen_timer_callback(&vcpu->arch.xen.timer); + } else { + ktime_t ktime_now = ktime_get(); + hrtimer_start(&vcpu->arch.xen.timer, + ktime_add_ns(ktime_now, delta_ns), + HRTIMER_MODE_ABS_HARD); + } +} + +static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_cancel(&vcpu->arch.xen.timer); + vcpu->arch.xen.timer_expires = 0; + atomic_set(&vcpu->arch.xen.timer_pending, 0); +} + +static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) +{ + hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); + vcpu->arch.xen.timer.function = xen_timer_callback; +} + static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; @@ -612,6 +673,28 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) } break; + case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + if (data->u.timer.port) { + if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { + r = -EINVAL; + break; + } + vcpu->arch.xen.timer_virq = data->u.timer.port; + kvm_xen_init_timer(vcpu); + + /* Restart the timer if it's set */ + if (data->u.timer.expires_ns) + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, + data->u.timer.expires_ns - + get_kvmclock_ns(vcpu->kvm)); + } else if (kvm_xen_timer_enabled(vcpu)) { + kvm_xen_stop_timer(vcpu); + vcpu->arch.xen.timer_virq = 0; + } + + r = 0; + break; + default: break; } @@ -692,6 +775,13 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_TIMER: + data->u.timer.port = vcpu->arch.xen.timer_virq; + data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; + r = 0; + break; + default: break; } @@ -827,6 +917,112 @@ static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, int cmd, u64 param, u6 return false; } +struct compat_vcpu_set_singleshot_timer { + uint64_t timeout_abs_ns; + uint32_t flags; +} __attribute__((packed)); + +static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, + int vcpu_id, u64 param, u64 *r) +{ + struct vcpu_set_singleshot_timer oneshot; + s64 delta; + gpa_t gpa; + int idx; + + if (!kvm_xen_timer_enabled(vcpu)) + return false; + + switch (cmd) { + case VCPUOP_set_singleshot_timer: + if (vcpu->arch.xen.vcpu_id != vcpu_id) { + *r = -EINVAL; + return true; + } + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + /* + * The only difference for 32-bit compat is the 4 bytes of + * padding after the interesting part of the structure. So + * for a faithful emulation of Xen we have to *try* to copy + * the padding and return -EFAULT if we can't. Otherwise we + * might as well just have copied the 12-byte 32-bit struct. 
+ */ + BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != + offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != + sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns)); + BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) != + offsetof(struct vcpu_set_singleshot_timer, flags)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != + sizeof_field(struct vcpu_set_singleshot_timer, flags)); + + if (!gpa || + kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) : + sizeof(struct compat_vcpu_set_singleshot_timer))) { + *r = -EFAULT; + return true; + } + + delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); + if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) { + *r = -ETIME; + return true; + } + + kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); + *r = 0; + return true; + + case VCPUOP_stop_singleshot_timer: + if (vcpu->arch.xen.vcpu_id != vcpu_id) { + *r = -EINVAL; + return true; + } + kvm_xen_stop_timer(vcpu); + *r = 0; + return true; + } + + return false; +} + +static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, + u64 *r) +{ + if (!kvm_xen_timer_enabled(vcpu)) + return false; + + if (timeout) { + uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); + int64_t delta = timeout - guest_now; + + /* Xen has a 'Linux workaround' in do_set_timer_op() which + * checks for negative absolute timeout values (caused by + * integer overflow), and for values about 13 days in the + * future (2^50ns) which would be caused by jiffies + * overflow. For those cases, it sets the timeout 100ms in + * the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep + * churning CPU time by waking it up). + */ + if (unlikely((int64_t)timeout < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0))) { + delta = 100 * NSEC_PER_MSEC; + timeout = guest_now + delta; + } + + kvm_xen_start_timer(vcpu, timeout, delta); + } else { + kvm_xen_stop_timer(vcpu); + } + + *r = 0; + return true; +} + int kvm_xen_hypercall(struct kvm_vcpu *vcpu) { bool longmode; @@ -870,6 +1066,18 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) case __HYPERVISOR_sched_op: handled = kvm_xen_hcall_sched_op(vcpu, params[0], params[1], &r); break; + case __HYPERVISOR_vcpu_op: + handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1], + params[2], &r); + break; + case __HYPERVISOR_set_timer_op: { + u64 timeout = params[0]; + /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. 
*/ + if (!longmode) + timeout |= params[1] << 32; + handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r); + break; + } default: break; } @@ -1398,6 +1606,9 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) { + if (kvm_xen_timer_enabled(vcpu)) + kvm_xen_stop_timer(vcpu); + kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.runstate_cache); kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 54d587aae85b..ee5c4ae0755c 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -62,6 +62,20 @@ static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu) vcpu->arch.xen.evtchn_pending_sel; } +static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) +{ + return !!vcpu->arch.xen.timer_virq; +} + +static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu)) + return atomic_read(&vcpu->arch.xen.timer_pending); + + return 0; +} + +void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu); #else static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) { @@ -107,6 +121,20 @@ static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu) { return false; } + +static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) +{ +} + +static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} #endif int kvm_xen_hypercall(struct kvm_vcpu *vcpu); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4b65e9f0a4d9..cb223e425223 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1748,6 +1748,11 @@ struct kvm_xen_vcpu_attr { __u64 time_offline; } runstate; __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; } u; }; @@ -1760,6 +1765,7 @@ struct kvm_xen_vcpu_attr { #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { -- cgit v1.2.3 From 28d1629f751c4a5f9437fbaa0ee4ed81d1a8e587 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:23 +0000 Subject: KVM: x86/xen: Kernel acceleration for XENVER_version Turns out this is a fast path for PV guests because they use it to trigger the event channel upcall. So letting it bounce all the way up to userspace is not great. 
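[ ed: A hedged sketch of the userspace side of this, assuming the uapi
  additions in this patch; the version value is an arbitrary example:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Let the kernel answer XENVER_version directly, e.g. as Xen 4.16. */
	static int xen_set_version(int vm_fd)
	{
		struct kvm_xen_hvm_attr attr = {
			.type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
			.u.xen_version = (4 << 16) | 16, /* XEN_MAJOR << 16 | XEN_MINOR */
		};

		return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
	}

  Once set, a guest's XENVER_version hypercall is answered in-kernel, which
  matters because PV guests issue it as the dummy hypercall that triggers
  delivery of pending event channels. ]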
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-14-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 19 +++++++++++++++++++ include/uapi/linux/kvm.h | 3 ++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8193d5285544..f3fba9d8ddc6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1026,6 +1026,7 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + u32 xen_version; bool long_mode; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 0e1595dc8c5f..98438f27f6b3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -465,6 +466,13 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = kvm_xen_setattr_evtchn(kvm, data); break; + case KVM_XEN_ATTR_TYPE_XEN_VERSION: + mutex_lock(&kvm->lock); + kvm->arch.xen.xen_version = data->u.xen_version; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } @@ -497,6 +505,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_XEN_VERSION: + data->u.xen_version = kvm->arch.xen.xen_version; + r = 0; + break; + default: break; } @@ -1059,6 +1072,12 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) params[3], params[4], params[5]); switch (input) { + case __HYPERVISOR_xen_version: + if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { + r = vcpu->kvm->arch.xen.xen_version; + handled = true; + } + break; case __HYPERVISOR_event_channel_op: if (params[0] == EVTCHNOP_send) handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cb223e425223..4dda3896ed71 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1711,7 +1711,7 @@ struct kvm_xen_hvm_attr { __u32 padding[4]; } deliver; } evtchn; - + __u32 xen_version; __u64 pad[8]; } u; }; @@ -1722,6 +1722,7 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ #define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) -- cgit v1.2.3 From fde0451be8fb3208d4d146b8602d99ee8139e515 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:24 +0000 Subject: KVM: x86/xen: Support per-vCPU event channel upcall via local APIC Windows uses a per-vCPU vector, and it's delivered via the local APIC basically like an MSI (with associated EOI) unlike the traditional guest-wide vector which is just magically asserted by Xen (and in the KVM case by kvm_xen_has_interrupt() / kvm_cpu_get_extint()). Now that the kernel is able to raise event channel events for itself, being able to do so for Windows guests is also going to be useful. 
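[ ed: Illustrative only: how a VMM might set (or clear, by passing 0) the
  per-vCPU vector with the new attribute. The field names come from the
  uapi hunk in this patch, and the range check mirrors the one in
  kvm_xen_vcpu_set_attr above (values below 0x10 are rejected):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Mimic HVMOP_set_evtchn_upcall_vector: 0 clears, else >= 0x10. */
	static int xen_set_upcall_vector(int vcpu_fd, unsigned char vector)
	{
		struct kvm_xen_vcpu_attr attr = {
			.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR,
			.u.vector = vector,
		};

		return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &attr);
	}
]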
Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-15-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 2 ++ 3 files changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f3fba9d8ddc6..998caf7a3ce9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,6 +606,7 @@ struct kvm_vcpu_hv { struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; + u8 upcall_vector; struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 98438f27f6b3..5afaf7b59944 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -314,6 +314,22 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } +static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) +{ + struct kvm_lapic_irq irq = { }; + int r; + + irq.dest_id = v->vcpu_id; + irq.vector = v->arch.xen.upcall_vector; + irq.dest_mode = APIC_DEST_PHYSICAL; + irq.shorthand = APIC_DEST_NOSHORT; + irq.delivery_mode = APIC_DM_FIXED; + irq.level = 1; + + /* The fast version will always work for physical unicast */ + WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); +} + /* * On event channel delivery, the vcpu_info may not have been accessible. * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which @@ -374,6 +390,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) } read_unlock_irqrestore(&gpc->lock, flags); + /* For the per-vCPU lapic vector, deliver it as MSI. */ + if (v->arch.xen.upcall_vector) + kvm_xen_inject_vcpu_vector(v); + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } @@ -708,6 +728,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: + if (data->u.vector && data->u.vector < 0x10) + r = -EINVAL; + else { + vcpu->arch.xen.upcall_vector = data->u.vector; + r = 0; + } + break; + default: break; } @@ -795,6 +824,11 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: + data->u.vector = vcpu->arch.xen.upcall_vector; + r = 0; + break; + default: break; } @@ -1228,6 +1262,12 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) kick_vcpu = true; } } + + /* For the per-vCPU lapic vector, deliver it as MSI. 
*/ + if (kick_vcpu && vcpu->arch.xen.upcall_vector) { + kvm_xen_inject_vcpu_vector(vcpu); + kick_vcpu = false; + } } out_rcu: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4dda3896ed71..a9ba690c4f37 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1754,6 +1754,7 @@ struct kvm_xen_vcpu_attr { __u32 priority; __u64 expires_ns; } timer; + __u8 vector; } u; }; @@ -1767,6 +1768,7 @@ struct kvm_xen_vcpu_attr { /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 #define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { -- cgit v1.2.3 From 661a20fab7d156cf6b9a407c946a1e558a633151 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:25 +0000 Subject: KVM: x86/xen: Advertise and document KVM_XEN_HVM_CONFIG_EVTCHN_SEND MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the end of the patch series adding this batch of event channel acceleration features, finally add the feature bit which advertises them and document it all. For SCHEDOP_poll we need to wake a polling vCPU when a given port is triggered, even when it's masked — and we want to implement that in the kernel, for efficiency. So we want the kernel to know that it has sole ownership of event channel delivery. Thus, we allow userspace to make the 'promise' by setting the corresponding feature bit in its KVM_XEN_HVM_CONFIG call. As we implement SCHEDOP_poll bypass later, we will do so only if that promise has been made by userspace. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-16-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 129 +++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/x86.c | 3 +- arch/x86/kvm/xen.c | 6 +- include/uapi/linux/kvm.h | 1 + 4 files changed, 127 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 700df350332b..fdfd9e1bc7f7 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -982,12 +982,22 @@ memory. __u8 pad2[30]; }; -If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the -KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl. -This requests KVM to generate the contents of the hypercall page -automatically; hypercalls will be intercepted and passed to userspace -through KVM_EXIT_XEN. In this case, all of the blob size and address -fields must be zero. +If certain flags are returned from the KVM_CAP_XEN_HVM check, they may +be set in the flags field of this ioctl: + +The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate +the contents of the hypercall page automatically; hypercalls will be +intercepted and passed to userspace through KVM_EXIT_XEN. In this +case, all of the blob size and address fields must be zero. + +The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace +will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event +channel interrupts rather than manipulating the guest's shared_info +structures directly. This, in turn, may allow KVM to enable features +such as intercepting the SCHEDOP_poll hypercall to accelerate PV +spinlock operation for the guest. 
Userspace may still use the ioctl +to deliver events if it was advertised, even if userspace does not +send this indication that it will always do so. No other flags are currently valid in the struct kvm_xen_hvm_config. @@ -5216,7 +5226,25 @@ have deterministic behavior. struct { __u64 gfn; } shared_info; - __u64 pad[4]; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; } u; }; @@ -5247,6 +5275,30 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO KVM_XEN_ATTR_TYPE_UPCALL_VECTOR Sets the exception vector used to deliver Xen event channel upcalls. + This is the HVM-wide vector injected directly by the hypervisor + (not through the local APIC), typically configured by a guest via + HVM_PARAM_CALLBACK_IRQ. + +KVM_XEN_ATTR_TYPE_EVTCHN + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures + an outbound port number for interception of EVTCHNOP_send requests + from the guest. A given sending port number may be directed back + to a specified vCPU (by APIC ID) / port / priority on the guest, + or to trigger events on an eventfd. The vCPU and priority can be + changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call, + but other fields cannot change for a given sending port. A port + mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags + field. + +KVM_XEN_ATTR_TYPE_XEN_VERSION + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures + the 32-bit version code returned to the guest when it invokes the + XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV + Xen guests will often use this as a dummy hypercall to trigger + event channel delivery, so responding within the kernel without + exiting to userspace is beneficial. 4.127 KVM_XEN_HVM_GET_ATTR -------------------------- @@ -5258,7 +5310,8 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR :Returns: 0 on success, < 0 on error Allows Xen VM attributes to be read. For the structure and types, -see KVM_XEN_HVM_SET_ATTR above. +see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN +attribute cannot be read. 4.128 KVM_XEN_VCPU_SET_ATTR --------------------------- @@ -5285,6 +5338,13 @@ see KVM_XEN_HVM_SET_ATTR above. __u64 time_blocked; __u64 time_offline; } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; } u; }; @@ -5326,6 +5386,27 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST or RUNSTATE_offline) to set the current accounted state as of the adjusted state_entry_time. +KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen + vCPU ID of the given vCPU, to allow timer-related VCPU operations to + be intercepted by KVM. + +KVM_XEN_VCPU_ATTR_TYPE_TIMER + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the + event channel port/priority for the VIRQ_TIMER of the vCPU, as well + as allowing a pending timer to be saved/restored. 
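[ ed: A hedged save/restore sketch for the timer attribute just described,
  e.g. for live migration; it assumes fds for the source and destination
  vCPUs and a <linux/kvm.h> carrying this series:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Save the Xen singleshot timer from one vCPU, restore on another. */
	static int xen_migrate_timer(int src_vcpu_fd, int dst_vcpu_fd)
	{
		struct kvm_xen_vcpu_attr t = {
			.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
		};

		if (ioctl(src_vcpu_fd, KVM_XEN_VCPU_GET_ATTR, &t) < 0)
			return -1;
		/* A non-zero expires_ns re-arms the timer on the destination. */
		return ioctl(dst_vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &t);
	}
]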
+ +KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the + per-vCPU local APIC upcall vector, configured by a Xen guest with + the HVMOP_set_evtchn_upcall_vector hypercall. This is typically + used by Windows guests, and is distinct from the HVM-wide upcall + vector configured with HVM_PARAM_CALLBACK_IRQ. + + 4.129 KVM_XEN_VCPU_GET_ATTR --------------------------- @@ -5645,6 +5726,25 @@ enabled with ``arch_prctl()``, but this may change in the future. The offsets of the state save areas in struct kvm_xsave follow the contents of CPUID leaf 0xD on the host. +4.135 KVM_XEN_HVM_EVTCHN_SEND +----------------------------- + +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_irq_routing_xen_evtchn +:Returns: 0 on success, < 0 on error + + +:: + + struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; + }; + +This ioctl injects an event channel interrupt directly to the guest vCPU. 5. The kvm_run structure ======================== @@ -7620,8 +7720,9 @@ PVHVM guests. Valid flags are:: #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) - #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 2) - #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 3) + #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG ioctl is available, for the guest to set its hypercall page. @@ -7645,6 +7746,14 @@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority field set to indicate 2 level event channel delivery. +The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports +injecting event channel events directly into the guest with the +KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the +KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the +KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes +related to event channel delivery, timers, and the XENVER_version +interception. 
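[ ed: Putting the advertisement and the opt-in together, a minimal hedged
  sketch; KVM_CHECK_EXTENSION returns the flag mask for KVM_CAP_XEN_HVM as
  documented above, and a zero-initialized config keeps the blob size and
  address fields at zero as INTERCEPT_HCALL requires:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Opt in to in-kernel event channel delivery when advertised. */
	static int xen_enable_evtchn_send(int kvm_fd, int vm_fd)
	{
		int feat = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

		if (feat < 0 || !(feat & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
			return -1;

		struct kvm_xen_hvm_config cfg = {
			.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
				 KVM_XEN_HVM_CONFIG_EVTCHN_SEND,
		};
		return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
	}
]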
+ 8.31 KVM_CAP_PPC_MULTITCE ------------------------- diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d7b65f33bd8..1a7d1b5f7232 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4276,7 +4276,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO | - KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 5afaf7b59944..53232c9ff89c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -909,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) { - if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) + /* Only some feature flags need to be *enabled* by userspace */ + u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + + if (xhc->flags & ~permitted_flags) return -EINVAL; /* diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a9ba690c4f37..ee5cc9e2a837 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1232,6 +1232,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) struct kvm_xen_hvm_config { __u32 flags; -- cgit v1.2.3 From ffbb61d09fc56c85e28b110494f3788d0ed4d1f8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 25 Feb 2022 14:53:02 +0000 Subject: KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl. This sets the default TSC frequency for subsequently created vCPUs. Signed-off-by: David Woodhouse Message-Id: <20220225145304.36166-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 +++++++---- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 26 +++++++++++++++++++++++++- include/uapi/linux/kvm.h | 4 +++- 4 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fdfd9e1bc7f7..b102ba7cf903 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1897,22 +1897,25 @@ the future. 4.55 KVM_SET_TSC_KHZ -------------------- -:Capability: KVM_CAP_TSC_CONTROL +:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL :Architectures: x86 -:Type: vcpu ioctl +:Type: vcpu ioctl / vm ioctl :Parameters: virtual tsc_khz :Returns: 0 on success, -1 on error Specifies the tsc frequency for the virtual machine. The unit of the frequency is KHz. +If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also +be used as a vm ioctl to set the initial tsc frequency of subsequently +created vCPUs. 
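[ ed: Hedged sketch of the new VM-level usage; 2500000 kHz is an arbitrary
  example value, and per the x86.c hunk below a value of 0 falls back to
  the host tsc_khz:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Give every subsequently created vCPU a 2.5 GHz guest TSC. */
	static int vm_set_default_tsc(int kvm_fd, int vm_fd)
	{
		if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) <= 0)
			return -1;

		return ioctl(vm_fd, KVM_SET_TSC_KHZ, 2500000UL); /* in kHz */
	}

  vCPUs created after this call inherit the frequency at KVM_CREATE_VCPU
  time, so it must be issued before the vCPUs exist. ]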
4.56 KVM_GET_TSC_KHZ -------------------- -:Capability: KVM_CAP_GET_TSC_KHZ +:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL :Architectures: x86 -:Type: vcpu ioctl +:Type: vcpu ioctl / vm ioctl :Parameters: none :Returns: virtual tsc-khz on success, negative value on error diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5370744b789c..fac990cc189d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,6 +1126,8 @@ struct kvm_arch { u64 cur_tsc_generation; int nr_vcpus_matched_tsc; + u32 default_tsc_khz; + seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1a7d1b5f7232..c069b97a8523 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: + case KVM_CAP_VM_TSC_CONTROL: r = kvm_has_tsc_control; break; case KVM_CAP_X2APIC_API: @@ -6522,6 +6523,28 @@ set_pit2_out: case KVM_GET_CLOCK: r = kvm_vm_ioctl_get_clock(kvm, argp); break; + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + user_tsc_khz = (u32)arg; + + if (kvm_has_tsc_control && + user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); + r = 0; + + goto out; + } + case KVM_GET_TSC_KHZ: { + r = READ_ONCE(kvm->arch.default_tsc_khz); + goto out; + } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) @@ -11266,7 +11289,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_xen_init_vcpu(vcpu); kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); - kvm_set_tsc_khz(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); vcpu_put(vcpu); @@ -11714,6 +11737,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + kvm->arch.default_tsc_khz = max_tsc_khz; kvm->arch.guest_can_read_msr_platform_info = true; kvm->arch.enable_pmu = enable_pmu; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ee5cc9e2a837..8616af85dc5d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1144,6 +1144,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_MEM_OP_EXTENSION 211 #define KVM_CAP_PMU_CAPABILITY 212 #define KVM_CAP_DISABLE_QUIRKS2 213 +#define KVM_CAP_VM_TSC_CONTROL 214 #ifdef KVM_CAP_IRQ_ROUTING @@ -1471,7 +1472,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) /* Available with KVM_CAP_PPC_GET_PVINFO */ #define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) -/* Available with KVM_CAP_TSC_CONTROL */ +/* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with +* KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) /* Available with KVM_CAP_PCI_2_3 */ -- cgit v1.2.3 From 1cd927ad6f62f27d8908498dcbf61395c5dd5fe2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 1 Apr 2022 14:39:03 -0400 Subject: tracing: mark user_events as BROKEN After being merged, user_events become more visible to a wider audience that have concerns with the current API. 
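[ ed: For context, the API drawing these concerns is ioctl/write based. A
  hedged sketch, assuming the pre-revert uapi header is still installed and
  using the 5.18-era tracefs path and write layout (payload writes begin
  with the registered write_index) — exactly the details that were expected
  to change:

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/user_events.h>

	int main(void)
	{
		int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
		struct user_reg reg = { .size = sizeof(reg) };
		struct { uint32_t idx; uint32_t value; } payload;

		if (fd < 0)
			return 1;

		/* Register "test_event" with one u32 field. */
		reg.name_args = (uint64_t)(unsigned long)"test_event u32 value";
		if (ioctl(fd, DIAG_IOCSREG, &reg) < 0) /* fills write_index */
			return 1;

		payload.idx = reg.write_index; /* writes start with the index */
		payload.value = 42;
		write(fd, &payload, sizeof(payload)); /* emits the trace event */
		close(fd);
		return 0;
	}
]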
It is too late to fix this for this release, but instead of a full revert, just mark it as BROKEN (which prevents it from being selected in make config). Then we can work finding a better API. If that fails, then it will need to be completely reverted. To not have the code silently bitrot, still allow building it with COMPILE_TEST. And to prevent the uapi header from being installed, then later changed, and then have an old distro user space see the old version, move the header file out of the uapi directory. Surround the include with CONFIG_COMPILE_TEST to the current location, but when the BROKEN tag is taken off, it will use the uapi directory, and fail to compile. This is a good way to remind us to move the header back. Link: https://lore.kernel.org/all/20220330155835.5e1f6669@gandalf.local.home Link: https://lkml.kernel.org/r/20220330201755.29319-1-mathieu.desnoyers@efficios.com Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) Signed-off-by: Linus Torvalds --- include/linux/user_events.h | 116 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/user_events.h | 116 --------------------------------------- kernel/trace/Kconfig | 1 + kernel/trace/trace_events_user.c | 5 ++ 4 files changed, 122 insertions(+), 116 deletions(-) create mode 100644 include/linux/user_events.h delete mode 100644 include/uapi/linux/user_events.h (limited to 'include/uapi/linux') diff --git a/include/linux/user_events.h b/include/linux/user_events.h new file mode 100644 index 000000000000..e570840571e1 --- /dev/null +++ b/include/linux/user_events.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ +#ifndef _UAPI_LINUX_USER_EVENTS_H +#define _UAPI_LINUX_USER_EVENTS_H + +#include +#include + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_PREFIX "u:" + +/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ +#define EVENT_BIT_FTRACE 0 +#define EVENT_BIT_PERF 1 +#define EVENT_BIT_OTHER 7 + +#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) +#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) +#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) + +/* Create dynamic location entry within a 32-bit value */ +#define DYN_LOC(offset, size) ((size) << 16 | (offset)) + +/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */ +#define FLAG_BPF_ITER (1 << 0) + +/* + * Describes an event registration and stores the results of the registration. + * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum + * must set the size and name_args before invocation. 
+ */ +struct user_reg { + + /* Input: Size of the user_reg structure being used */ + __u32 size; + + /* Input: Pointer to string with event name, description and flags */ + __u64 name_args; + + /* Output: Byte index of the event within the status page */ + __u32 status_index; + + /* Output: Index of the event to use when writing data */ + __u32 write_index; +}; + +#define DIAG_IOC_MAGIC '*' + +/* Requests to register a user_event */ +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) + +/* Requests to delete a user_event */ +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) + +/* Data type that was passed to the BPF program */ +enum { + /* Data resides in kernel space */ + USER_BPF_DATA_KERNEL, + + /* Data resides in user space */ + USER_BPF_DATA_USER, + + /* Data is a pointer to a user_bpf_iter structure */ + USER_BPF_DATA_ITER, +}; + +/* + * Describes an iovec iterator that BPF programs can use to access data for + * a given user_event write() / writev() call. + */ +struct user_bpf_iter { + + /* Offset of the data within the first iovec */ + __u32 iov_offset; + + /* Number of iovec structures */ + __u32 nr_segs; + + /* Pointer to iovec structures */ + const struct iovec *iov; +}; + +/* Context that BPF programs receive when attached to a user_event */ +struct user_bpf_context { + + /* Data type being passed (see union below) */ + __u32 data_type; + + /* Length of the data */ + __u32 data_len; + + /* Pointer to data, varies by data type */ + union { + /* Kernel data (data_type == USER_BPF_DATA_KERNEL) */ + void *kdata; + + /* User data (data_type == USER_BPF_DATA_USER) */ + void *udata; + + /* Direct iovec (data_type == USER_BPF_DATA_ITER) */ + struct user_bpf_iter *iter; + }; +}; + +#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h deleted file mode 100644 index e570840571e1..000000000000 --- a/include/uapi/linux/user_events.h +++ /dev/null @@ -1,116 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (c) 2021, Microsoft Corporation. - * - * Authors: - * Beau Belgrave - */ -#ifndef _UAPI_LINUX_USER_EVENTS_H -#define _UAPI_LINUX_USER_EVENTS_H - -#include -#include - -#ifdef __KERNEL__ -#include -#else -#include -#endif - -#define USER_EVENTS_SYSTEM "user_events" -#define USER_EVENTS_PREFIX "u:" - -/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ -#define EVENT_BIT_FTRACE 0 -#define EVENT_BIT_PERF 1 -#define EVENT_BIT_OTHER 7 - -#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) -#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) -#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) - -/* Create dynamic location entry within a 32-bit value */ -#define DYN_LOC(offset, size) ((size) << 16 | (offset)) - -/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */ -#define FLAG_BPF_ITER (1 << 0) - -/* - * Describes an event registration and stores the results of the registration. - * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum - * must set the size and name_args before invocation. 
- */ -struct user_reg { - - /* Input: Size of the user_reg structure being used */ - __u32 size; - - /* Input: Pointer to string with event name, description and flags */ - __u64 name_args; - - /* Output: Byte index of the event within the status page */ - __u32 status_index; - - /* Output: Index of the event to use when writing data */ - __u32 write_index; -}; - -#define DIAG_IOC_MAGIC '*' - -/* Requests to register a user_event */ -#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) - -/* Requests to delete a user_event */ -#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) - -/* Data type that was passed to the BPF program */ -enum { - /* Data resides in kernel space */ - USER_BPF_DATA_KERNEL, - - /* Data resides in user space */ - USER_BPF_DATA_USER, - - /* Data is a pointer to a user_bpf_iter structure */ - USER_BPF_DATA_ITER, -}; - -/* - * Describes an iovec iterator that BPF programs can use to access data for - * a given user_event write() / writev() call. - */ -struct user_bpf_iter { - - /* Offset of the data within the first iovec */ - __u32 iov_offset; - - /* Number of iovec structures */ - __u32 nr_segs; - - /* Pointer to iovec structures */ - const struct iovec *iov; -}; - -/* Context that BPF programs receive when attached to a user_event */ -struct user_bpf_context { - - /* Data type being passed (see union below) */ - __u32 data_type; - - /* Length of the data */ - __u32 data_len; - - /* Pointer to data, varies by data type */ - union { - /* Kernel data (data_type == USER_BPF_DATA_KERNEL) */ - void *kdata; - - /* User data (data_type == USER_BPF_DATA_USER) */ - void *udata; - - /* Direct iovec (data_type == USER_BPF_DATA_ITER) */ - struct user_bpf_iter *iter; - }; -}; - -#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9bb54c0b3b2d..2c43e327a619 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -767,6 +767,7 @@ config USER_EVENTS bool "User trace events" select TRACING select DYNAMIC_EVENTS + depends on BROKEN || COMPILE_TEST # API needs to be straighten out help User trace events are user-defined trace events that can be used like an existing kernel trace event. User trace diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 8b3d241a31c2..68d62bfac12f 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -18,7 +18,12 @@ #include #include #include +/* Reminder to move to uapi when everything works */ +#ifdef CONFIG_COMPILE_TEST +#include +#else #include +#endif #include "trace.h" #include "trace_dynevent.h" -- cgit v1.2.3 From 66df0fdb5981052f3ad97c9879eda93712bdefc2 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Sun, 3 Apr 2022 19:53:26 +0800 Subject: bpf: Correct the comment for BTF kind bitfield The commit 8fd886911a6a ("bpf: Add BTF_KIND_FLOAT to uapi") has extended the BTF kind bitfield from 4 to 5 bits, correct the comment. Signed-off-by: Haiyue Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220403115327.205964-1-haiyue.wang@intel.com --- include/uapi/linux/btf.h | 4 ++-- tools/include/uapi/linux/btf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index b0d8fea1951d..a9162a6c0284 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -33,8 +33,8 @@ struct btf_type { /* "info" bits arrangement * bits 0-15: vlen (e.g. 
# of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index b0d8fea1951d..a9162a6c0284 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -33,8 +33,8 @@ struct btf_type { /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ -- cgit v1.2.3 From f56b919fa4f1b27c589e71f7d90e9785f9196bf1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 16 Feb 2022 09:39:22 +0100 Subject: linux/fb.h: Spelling s/palette/palette/ Fix a misspelling of "palette" in a comment. Signed-off-by: Geert Uytterhoeven Reviewed-by: Pekka Paalanen Signed-off-by: Helge Deller --- include/uapi/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h index 4c14e8be7267..3a49913d006c 100644 --- a/include/uapi/linux/fb.h +++ b/include/uapi/linux/fb.h @@ -182,7 +182,7 @@ struct fb_fix_screeninfo { * * For pseudocolor: offset and length should be the same for all color * components. Offset specifies the position of the least significant bit - * of the pallette index in a pixel value. Length indicates the number + * of the palette index in a pixel value. Length indicates the number * of available palette entries (i.e. # of entries = 1 << length). */ struct fb_bitfield { -- cgit v1.2.3 From 1ee375d77bb944321c969b456aa73994566cecf6 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 4 Apr 2022 10:54:47 -0700 Subject: net, uapi: remove inclusion of arpa/inet.h In include/uapi/linux/tipc_config.h, there's a comment that it includes arpa/inet.h for ntohs; but ntohs is not defined in any UAPI header. For now, reuse the definitions from include/linux/byteorder/generic.h, since the various conversion functions do exist in UAPI headers: include/uapi/linux/byteorder/big_endian.h include/uapi/linux/byteorder/little_endian.h We would like to get to the point where we can build UAPI header tests with -nostdinc, meaning that kernel UAPI headers should not have a circular dependency on libc headers. Link: https://android-review.googlesource.com/c/platform/bionic/+/2048127 Suggested-by: Jakub Kicinski Signed-off-by: Nick Desaulniers Signed-off-by: David S. Miller --- include/uapi/linux/tipc_config.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 4dfc05651c98..c00adf2fe868 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -43,10 +43,6 @@ #include #include -#ifndef __KERNEL__ -#include /* for ntohs etc. 
*/ -#endif - /* * Configuration * @@ -269,33 +265,33 @@ static inline int TLV_OK(const void *tlv, __u16 space) */ return (space >= TLV_SPACE(0)) && - (ntohs(((struct tlv_desc *)tlv)->tlv_len) <= space); + (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_len) <= space); } static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type) { return TLV_OK(tlv, space) && - (ntohs(((struct tlv_desc *)tlv)->tlv_type) == exp_type); + (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_type) == exp_type); } static inline int TLV_GET_LEN(struct tlv_desc *tlv) { - return ntohs(tlv->tlv_len); + return __be16_to_cpu(tlv->tlv_len); } static inline void TLV_SET_LEN(struct tlv_desc *tlv, __u16 len) { - tlv->tlv_len = htons(len); + tlv->tlv_len = __cpu_to_be16(len); } static inline int TLV_CHECK_TYPE(struct tlv_desc *tlv, __u16 type) { - return (ntohs(tlv->tlv_type) == type); + return (__be16_to_cpu(tlv->tlv_type) == type); } static inline void TLV_SET_TYPE(struct tlv_desc *tlv, __u16 type) { - tlv->tlv_type = htons(type); + tlv->tlv_type = __cpu_to_be16(type); } static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) @@ -305,8 +301,8 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) tlv_len = TLV_LENGTH(len); tlv_ptr = (struct tlv_desc *)tlv; - tlv_ptr->tlv_type = htons(type); - tlv_ptr->tlv_len = htons(tlv_len); + tlv_ptr->tlv_type = __cpu_to_be16(type); + tlv_ptr->tlv_len = __cpu_to_be16(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); @@ -348,7 +344,7 @@ static inline void *TLV_LIST_DATA(struct tlv_list_desc *list) static inline void TLV_LIST_STEP(struct tlv_list_desc *list) { - __u16 tlv_space = TLV_ALIGN(ntohs(list->tlv_ptr->tlv_len)); + __u16 tlv_space = TLV_ALIGN(__be16_to_cpu(list->tlv_ptr->tlv_len)); list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space); list->tlv_space -= tlv_space; @@ -404,9 +400,9 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, msg_len = TCM_LENGTH(data_len); tcm_hdr = (struct tipc_cfg_msg_hdr *)msg; - tcm_hdr->tcm_len = htonl(msg_len); - tcm_hdr->tcm_type = htons(cmd); - tcm_hdr->tcm_flags = htons(flags); + tcm_hdr->tcm_len = __cpu_to_be32(msg_len); + tcm_hdr->tcm_type = __cpu_to_be16(cmd); + tcm_hdr->tcm_flags = __cpu_to_be16(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); -- cgit v1.2.3 From fce96cf0443083e37455eff8f78fd240c621dae3 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:53 -0600 Subject: virt: Add SEV-SNP guest driver The SEV-SNP specification provides the guest a mechanism to communicate with the PSP without risk from a malicious hypervisor who wishes to read, alter, drop or replay the messages sent. The driver uses snp_issue_guest_request() to issue GHCB SNP_GUEST_REQUEST or SNP_EXT_GUEST_REQUEST NAE events to submit the request to PSP. The PSP requires that all communication should be encrypted using key specified through a struct snp_guest_platform_data descriptor. Userspace can use SNP_GET_REPORT ioctl() to query the guest attestation report. See SEV-SNP spec section Guest Messages for more details. [ bp: Remove the "what" from the commit message, massage. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-44-brijesh.singh@amd.com --- Documentation/virt/coco/sevguest.rst | 86 +++++ Documentation/virt/index.rst | 1 + drivers/virt/Kconfig | 3 + drivers/virt/Makefile | 1 + drivers/virt/coco/sevguest/Kconfig | 14 + drivers/virt/coco/sevguest/Makefile | 2 + drivers/virt/coco/sevguest/sevguest.c | 607 ++++++++++++++++++++++++++++++++++ drivers/virt/coco/sevguest/sevguest.h | 98 ++++++ include/uapi/linux/sev-guest.h | 50 +++ 9 files changed, 862 insertions(+) create mode 100644 Documentation/virt/coco/sevguest.rst create mode 100644 drivers/virt/coco/sevguest/Kconfig create mode 100644 drivers/virt/coco/sevguest/Makefile create mode 100644 drivers/virt/coco/sevguest/sevguest.c create mode 100644 drivers/virt/coco/sevguest/sevguest.h create mode 100644 include/uapi/linux/sev-guest.h (limited to 'include/uapi/linux') diff --git a/Documentation/virt/coco/sevguest.rst b/Documentation/virt/coco/sevguest.rst new file mode 100644 index 000000000000..3da782e867a3 --- /dev/null +++ b/Documentation/virt/coco/sevguest.rst @@ -0,0 +1,86 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================================================== +The Definitive SEV Guest API Documentation +=================================================================== + +1. General description +====================== + +The SEV API is a set of ioctls that are used by the guest or hypervisor +to get or set a certain aspect of the SEV virtual machine. The ioctls belong +to the following classes: + + - Hypervisor ioctls: These query and set global attributes which affect the + whole SEV firmware. These ioctls are used by platform provisioning tools. + + - Guest ioctls: These query and set attributes of the SEV virtual machine. + +2. API description +================== + +This section describes ioctls that are used for querying the SEV guest report +from the SEV firmware. For each ioctl, the following information is provided +along with a description: + + Technology: + which SEV technology provides this ioctl. SEV, SEV-ES, SEV-SNP or all. + + Type: + hypervisor or guest. The ioctl can be used inside the guest or the + hypervisor. + + Parameters: + what parameters are accepted by the ioctl. + + Returns: + the return value. General error numbers (-ENOMEM, -EINVAL) + are not detailed, but errors with specific meanings are. + +The guest ioctl should be issued on a file descriptor of the /dev/sev-guest device. +The ioctl accepts struct snp_guest_request_ioctl. The input and output structures are +specified through the req_data and resp_data fields, respectively. If the ioctl fails +to execute due to a firmware error, then the fw_err code will be set; otherwise +fw_err will be set to 0x00000000000000ff. + +The firmware checks that the message sequence counter is one greater than +the guest's message sequence counter. If the guest driver fails to increment the +message counter (e.g. counter overflow), then -EIO will be returned. 
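[ ed: A hedged sketch of the guest userspace side. SNP_GET_REPORT and the
  snp_report_req/snp_report_resp layouts (a 64-byte user_data nonce) are
  taken from the new include/uapi/linux/sev-guest.h, whose contents are not
  quoted in this hunk, so treat the field names as assumptions:

	#include <fcntl.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/sev-guest.h>

	/* Ask the PSP for an attestation report over a caller nonce. */
	static int snp_get_report(const uint8_t nonce[64],
				  struct snp_report_resp *resp)
	{
		struct snp_report_req req = {};
		struct snp_guest_request_ioctl guest = {
			.msg_version = 1,
			.req_data = (uint64_t)(unsigned long)&req,
			.resp_data = (uint64_t)(unsigned long)resp,
		};
		int fd, rc;

		memcpy(req.user_data, nonce, sizeof(req.user_data));

		fd = open("/dev/sev-guest", O_RDWR);
		if (fd < 0)
			return -1;

		rc = ioctl(fd, SNP_GET_REPORT, &guest); /* fw_err holds PSP code */
		close(fd);
		return rc;
	}
]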
+ +:: + + struct snp_guest_request_ioctl { + /* Message version number */ + __u32 msg_version; + + /* Request and response structure address */ + __u64 req_data; + __u64 resp_data; + + /* firmware error code on failure (see psp-sev.h) */ + __u64 fw_err; + }; + +2.1 SNP_GET_REPORT +------------------ + +:Technology: sev-snp +:Type: guest ioctl +:Parameters (in): struct snp_report_req +:Returns (out): struct snp_report_resp on success, negative on error + +The SNP_GET_REPORT ioctl can be used to query the attestation report from the +SEV-SNP firmware. The ioctl uses the SNP_GUEST_REQUEST (MSG_REPORT_REQ) command +provided by the SEV-SNP firmware to query the attestation report. + +On success, snp_report_resp.data will contain the report. The report +is in the format described in the SEV-SNP specification. See the SEV-SNP +specification for further details. + + +Reference +--------- + +SEV-SNP and GHCB specification: developer.amd.com/sev + +The driver is based on SEV-SNP firmware spec 0.9 and GHCB spec version 2.0. diff --git a/Documentation/virt/index.rst b/Documentation/virt/index.rst index edea7fea95a8..40ad0d20032e 100644 --- a/Documentation/virt/index.rst +++ b/Documentation/virt/index.rst @@ -13,6 +13,7 @@ Linux Virtualization Support guest-halt-polling ne_overview acrn/index + coco/sevguest .. only:: html and subproject diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index 121b9293c737..7d3273cfab27 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -47,4 +47,7 @@ source "drivers/virt/vboxguest/Kconfig" source "drivers/virt/nitro_enclaves/Kconfig" source "drivers/virt/acrn/Kconfig" + +source "drivers/virt/coco/sevguest/Kconfig" + endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index 108d0ffcc9aa..7b87a7ba1972 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -9,3 +9,4 @@ obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ obj-$(CONFIG_ACRN_HSM) += acrn/ +obj-$(CONFIG_SEV_GUEST) += coco/sevguest/ diff --git a/drivers/virt/coco/sevguest/Kconfig b/drivers/virt/coco/sevguest/Kconfig new file mode 100644 index 000000000000..74ca1fe09437 --- /dev/null +++ b/drivers/virt/coco/sevguest/Kconfig @@ -0,0 +1,14 @@ +config SEV_GUEST + tristate "AMD SEV Guest driver" + default m + depends on AMD_MEM_ENCRYPT + select CRYPTO_AEAD2 + select CRYPTO_GCM + help + SEV-SNP firmware provides the guest a mechanism to communicate with + the PSP without risk from a malicious hypervisor who wishes to read, + alter, drop or replay the messages sent. The driver provides a + userspace interface to communicate with the PSP to request the + attestation report and more. + + If you choose 'M' here, this module will be called sevguest. diff --git a/drivers/virt/coco/sevguest/Makefile b/drivers/virt/coco/sevguest/Makefile new file mode 100644 index 000000000000..b1ffb2b4177b --- /dev/null +++ b/drivers/virt/coco/sevguest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SEV_GUEST) += sevguest.o diff --git a/drivers/virt/coco/sevguest/sevguest.c b/drivers/virt/coco/sevguest/sevguest.c new file mode 100644 index 000000000000..beda93cdeb4f --- /dev/null +++ b/drivers/virt/coco/sevguest/sevguest.c @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Encrypted Virtualization Nested Paging (SEV-SNP) guest request interface + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. 
+ * + * Author: Brijesh Singh + */ + +#define pr_fmt(fmt) "SNP: GUEST: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "sevguest.h" + +#define DEVICE_NAME "sev-guest" +#define AAD_LEN 48 +#define MSG_HDR_VER 1 + +struct snp_guest_crypto { + struct crypto_aead *tfm; + u8 *iv, *authtag; + int iv_len, a_len; +}; + +struct snp_guest_dev { + struct device *dev; + struct miscdevice misc; + + struct snp_guest_crypto *crypto; + struct snp_guest_msg *request, *response; + struct snp_secrets_page_layout *layout; + struct snp_req_data input; + u32 *os_area_msg_seqno; + u8 *vmpck; +}; + +static u32 vmpck_id; +module_param(vmpck_id, uint, 0444); +MODULE_PARM_DESC(vmpck_id, "The VMPCK ID to use when communicating with the PSP."); + +/* Mutex to serialize the shared buffer access and command handling. */ +static DEFINE_MUTEX(snp_cmd_mutex); + +static bool is_vmpck_empty(struct snp_guest_dev *snp_dev) +{ + char zero_key[VMPCK_KEY_LEN] = {0}; + + if (snp_dev->vmpck) + return !memcmp(snp_dev->vmpck, zero_key, VMPCK_KEY_LEN); + + return true; +} + +static void snp_disable_vmpck(struct snp_guest_dev *snp_dev) +{ + memzero_explicit(snp_dev->vmpck, VMPCK_KEY_LEN); + snp_dev->vmpck = NULL; +} + +static inline u64 __snp_get_msg_seqno(struct snp_guest_dev *snp_dev) +{ + u64 count; + + lockdep_assert_held(&snp_cmd_mutex); + + /* Read the current message sequence counter from secrets pages */ + count = *snp_dev->os_area_msg_seqno; + + return count + 1; +} + +/* Return a non-zero on success */ +static u64 snp_get_msg_seqno(struct snp_guest_dev *snp_dev) +{ + u64 count = __snp_get_msg_seqno(snp_dev); + + /* + * The message sequence counter for the SNP guest request is a 64-bit + * value but the version 2 of GHCB specification defines a 32-bit storage + * for it. If the counter exceeds the 32-bit value then return zero. + * The caller should check the return value, but if the caller happens to + * not check the value and use it, then the firmware treats zero as an + * invalid number and will fail the message request. + */ + if (count >= UINT_MAX) { + dev_err(snp_dev->dev, "request message sequence counter overflow\n"); + return 0; + } + + return count; +} + +static void snp_inc_msg_seqno(struct snp_guest_dev *snp_dev) +{ + /* + * The counter is also incremented by the PSP, so increment it by 2 + * and save in secrets page. 
+ */ + *snp_dev->os_area_msg_seqno += 2; +} + +static inline struct snp_guest_dev *to_snp_dev(struct file *file) +{ + struct miscdevice *dev = file->private_data; + + return container_of(dev, struct snp_guest_dev, misc); +} + +static struct snp_guest_crypto *init_crypto(struct snp_guest_dev *snp_dev, u8 *key, size_t keylen) +{ + struct snp_guest_crypto *crypto; + + crypto = kzalloc(sizeof(*crypto), GFP_KERNEL_ACCOUNT); + if (!crypto) + return NULL; + + crypto->tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(crypto->tfm)) + goto e_free; + + if (crypto_aead_setkey(crypto->tfm, key, keylen)) + goto e_free_crypto; + + crypto->iv_len = crypto_aead_ivsize(crypto->tfm); + crypto->iv = kmalloc(crypto->iv_len, GFP_KERNEL_ACCOUNT); + if (!crypto->iv) + goto e_free_crypto; + + if (crypto_aead_authsize(crypto->tfm) > MAX_AUTHTAG_LEN) { + if (crypto_aead_setauthsize(crypto->tfm, MAX_AUTHTAG_LEN)) { + dev_err(snp_dev->dev, "failed to set authsize to %d\n", MAX_AUTHTAG_LEN); + goto e_free_iv; + } + } + + crypto->a_len = crypto_aead_authsize(crypto->tfm); + crypto->authtag = kmalloc(crypto->a_len, GFP_KERNEL_ACCOUNT); + if (!crypto->authtag) + goto e_free_auth; + + return crypto; + +e_free_auth: + kfree(crypto->authtag); +e_free_iv: + kfree(crypto->iv); +e_free_crypto: + crypto_free_aead(crypto->tfm); +e_free: + kfree(crypto); + + return NULL; +} + +static void deinit_crypto(struct snp_guest_crypto *crypto) +{ + crypto_free_aead(crypto->tfm); + kfree(crypto->iv); + kfree(crypto->authtag); + kfree(crypto); +} + +static int enc_dec_message(struct snp_guest_crypto *crypto, struct snp_guest_msg *msg, + u8 *src_buf, u8 *dst_buf, size_t len, bool enc) +{ + struct snp_guest_msg_hdr *hdr = &msg->hdr; + struct scatterlist src[3], dst[3]; + DECLARE_CRYPTO_WAIT(wait); + struct aead_request *req; + int ret; + + req = aead_request_alloc(crypto->tfm, GFP_KERNEL); + if (!req) + return -ENOMEM; + + /* + * AEAD memory operations: + * +------ AAD -------+------- DATA -----+---- AUTHTAG----+ + * | msg header | plaintext | hdr->authtag | + * | bytes 30h - 5Fh | or | | + * | | cipher | | + * +------------------+------------------+----------------+ + */ + sg_init_table(src, 3); + sg_set_buf(&src[0], &hdr->algo, AAD_LEN); + sg_set_buf(&src[1], src_buf, hdr->msg_sz); + sg_set_buf(&src[2], hdr->authtag, crypto->a_len); + + sg_init_table(dst, 3); + sg_set_buf(&dst[0], &hdr->algo, AAD_LEN); + sg_set_buf(&dst[1], dst_buf, hdr->msg_sz); + sg_set_buf(&dst[2], hdr->authtag, crypto->a_len); + + aead_request_set_ad(req, AAD_LEN); + aead_request_set_tfm(req, crypto->tfm); + aead_request_set_callback(req, 0, crypto_req_done, &wait); + + aead_request_set_crypt(req, src, dst, len, crypto->iv); + ret = crypto_wait_req(enc ? 
crypto_aead_encrypt(req) : crypto_aead_decrypt(req), &wait); + + aead_request_free(req); + return ret; +} + +static int __enc_payload(struct snp_guest_dev *snp_dev, struct snp_guest_msg *msg, + void *plaintext, size_t len) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_guest_msg_hdr *hdr = &msg->hdr; + + memset(crypto->iv, 0, crypto->iv_len); + memcpy(crypto->iv, &hdr->msg_seqno, sizeof(hdr->msg_seqno)); + + return enc_dec_message(crypto, msg, plaintext, msg->payload, len, true); +} + +static int dec_payload(struct snp_guest_dev *snp_dev, struct snp_guest_msg *msg, + void *plaintext, size_t len) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_guest_msg_hdr *hdr = &msg->hdr; + + /* Build IV with response buffer sequence number */ + memset(crypto->iv, 0, crypto->iv_len); + memcpy(crypto->iv, &hdr->msg_seqno, sizeof(hdr->msg_seqno)); + + return enc_dec_message(crypto, msg, msg->payload, plaintext, len, false); +} + +static int verify_and_dec_payload(struct snp_guest_dev *snp_dev, void *payload, u32 sz) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_guest_msg *resp = snp_dev->response; + struct snp_guest_msg *req = snp_dev->request; + struct snp_guest_msg_hdr *req_hdr = &req->hdr; + struct snp_guest_msg_hdr *resp_hdr = &resp->hdr; + + dev_dbg(snp_dev->dev, "response [seqno %lld type %d version %d sz %d]\n", + resp_hdr->msg_seqno, resp_hdr->msg_type, resp_hdr->msg_version, resp_hdr->msg_sz); + + /* Verify that the sequence counter is incremented by 1 */ + if (unlikely(resp_hdr->msg_seqno != (req_hdr->msg_seqno + 1))) + return -EBADMSG; + + /* Verify response message type and version number. */ + if (resp_hdr->msg_type != (req_hdr->msg_type + 1) || + resp_hdr->msg_version != req_hdr->msg_version) + return -EBADMSG; + + /* + * If the message size is greater than our buffer length then return + * an error. 
+ */ + if (unlikely((resp_hdr->msg_sz + crypto->a_len) > sz)) + return -EBADMSG; + + /* Decrypt the payload */ + return dec_payload(snp_dev, resp, payload, resp_hdr->msg_sz + crypto->a_len); +} + +static bool enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8 type, + void *payload, size_t sz) +{ + struct snp_guest_msg *req = snp_dev->request; + struct snp_guest_msg_hdr *hdr = &req->hdr; + + memset(req, 0, sizeof(*req)); + + hdr->algo = SNP_AEAD_AES_256_GCM; + hdr->hdr_version = MSG_HDR_VER; + hdr->hdr_sz = sizeof(*hdr); + hdr->msg_type = type; + hdr->msg_version = version; + hdr->msg_seqno = seqno; + hdr->msg_vmpck = vmpck_id; + hdr->msg_sz = sz; + + /* Verify the sequence number is non-zero */ + if (!hdr->msg_seqno) + return -ENOSR; + + dev_dbg(snp_dev->dev, "request [seqno %lld type %d version %d sz %d]\n", + hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz); + + return __enc_payload(snp_dev, req, payload, sz); +} + +static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, int msg_ver, + u8 type, void *req_buf, size_t req_sz, void *resp_buf, + u32 resp_sz, __u64 *fw_err) +{ + unsigned long err; + u64 seqno; + int rc; + + /* Get message sequence and verify that its a non-zero */ + seqno = snp_get_msg_seqno(snp_dev); + if (!seqno) + return -EIO; + + memset(snp_dev->response, 0, sizeof(struct snp_guest_msg)); + + /* Encrypt the userspace provided payload */ + rc = enc_payload(snp_dev, seqno, msg_ver, type, req_buf, req_sz); + if (rc) + return rc; + + /* Call firmware to process the request */ + rc = snp_issue_guest_request(exit_code, &snp_dev->input, &err); + if (fw_err) + *fw_err = err; + + if (rc) + return rc; + + /* + * The verify_and_dec_payload() will fail only if the hypervisor is + * actively modifying the message header or corrupting the encrypted payload. + * This hints that hypervisor is acting in a bad faith. Disable the VMPCK so that + * the key cannot be used for any communication. The key is disabled to ensure + * that AES-GCM does not use the same IV while encrypting the request payload. + */ + rc = verify_and_dec_payload(snp_dev, resp_buf, resp_sz); + if (rc) { + dev_alert(snp_dev->dev, + "Detected unexpected decode failure, disabling the vmpck_id %d\n", + vmpck_id); + snp_disable_vmpck(snp_dev); + return rc; + } + + /* Increment to new message sequence after payload decryption was successful. */ + snp_inc_msg_seqno(snp_dev); + + return 0; +} + +static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_report_resp *resp; + struct snp_report_req req; + int rc, resp_len; + + lockdep_assert_held(&snp_cmd_mutex); + + if (!arg->req_data || !arg->resp_data) + return -EINVAL; + + if (copy_from_user(&req, (void __user *)arg->req_data, sizeof(req))) + return -EFAULT; + + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover the + * authtag. 
+ */ + resp_len = sizeof(resp->data) + crypto->a_len; + resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); + if (!resp) + return -ENOMEM; + + rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version, + SNP_MSG_REPORT_REQ, &req, sizeof(req), resp->data, + resp_len, &arg->fw_err); + if (rc) + goto e_free; + + if (copy_to_user((void __user *)arg->resp_data, resp, sizeof(*resp))) + rc = -EFAULT; + +e_free: + kfree(resp); + return rc; +} + +static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) +{ + struct snp_guest_dev *snp_dev = to_snp_dev(file); + void __user *argp = (void __user *)arg; + struct snp_guest_request_ioctl input; + int ret = -ENOTTY; + + if (copy_from_user(&input, argp, sizeof(input))) + return -EFAULT; + + input.fw_err = 0xff; + + /* Message version must be non-zero */ + if (!input.msg_version) + return -EINVAL; + + mutex_lock(&snp_cmd_mutex); + + /* Check if the VMPCK is not empty */ + if (is_vmpck_empty(snp_dev)) { + dev_err_ratelimited(snp_dev->dev, "VMPCK is disabled\n"); + mutex_unlock(&snp_cmd_mutex); + return -ENOTTY; + } + + switch (ioctl) { + case SNP_GET_REPORT: + ret = get_report(snp_dev, &input); + break; + default: + break; + } + + mutex_unlock(&snp_cmd_mutex); + + if (input.fw_err && copy_to_user(argp, &input, sizeof(input))) + return -EFAULT; + + return ret; +} + +static void free_shared_pages(void *buf, size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + int ret; + + if (!buf) + return; + + ret = set_memory_encrypted((unsigned long)buf, npages); + if (ret) { + WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); + return; + } + + __free_pages(virt_to_page(buf), get_order(sz)); +} + +static void *alloc_shared_pages(size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + struct page *page; + int ret; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz)); + if (IS_ERR(page)) + return NULL; + + ret = set_memory_decrypted((unsigned long)page_address(page), npages); + if (ret) { + pr_err("failed to mark page shared, ret=%d\n", ret); + __free_pages(page, get_order(sz)); + return NULL; + } + + return page_address(page); +} + +static const struct file_operations snp_guest_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = snp_guest_ioctl, +}; + +static u8 *get_vmpck(int id, struct snp_secrets_page_layout *layout, u32 **seqno) +{ + u8 *key = NULL; + + switch (id) { + case 0: + *seqno = &layout->os_area.msg_seqno_0; + key = layout->vmpck0; + break; + case 1: + *seqno = &layout->os_area.msg_seqno_1; + key = layout->vmpck1; + break; + case 2: + *seqno = &layout->os_area.msg_seqno_2; + key = layout->vmpck2; + break; + case 3: + *seqno = &layout->os_area.msg_seqno_3; + key = layout->vmpck3; + break; + default: + break; + } + + return key; +} + +static int __init snp_guest_probe(struct platform_device *pdev) +{ + struct snp_secrets_page_layout *layout; + struct snp_guest_platform_data *data; + struct device *dev = &pdev->dev; + struct snp_guest_dev *snp_dev; + struct miscdevice *misc; + int ret; + + if (!dev->platform_data) + return -ENODEV; + + data = (struct snp_guest_platform_data *)dev->platform_data; + layout = (__force void *)ioremap_encrypted(data->secrets_gpa, PAGE_SIZE); + if (!layout) + return -ENODEV; + + ret = -ENOMEM; + snp_dev = devm_kzalloc(&pdev->dev, sizeof(struct snp_guest_dev), GFP_KERNEL); + if (!snp_dev) + goto e_unmap; + + ret = -EINVAL; + snp_dev->vmpck = get_vmpck(vmpck_id, layout, &snp_dev->os_area_msg_seqno); + if (!snp_dev->vmpck) { + dev_err(dev, 
"invalid vmpck id %d\n", vmpck_id); + goto e_unmap; + } + + /* Verify that VMPCK is not zero. */ + if (is_vmpck_empty(snp_dev)) { + dev_err(dev, "vmpck id %d is null\n", vmpck_id); + goto e_unmap; + } + + platform_set_drvdata(pdev, snp_dev); + snp_dev->dev = dev; + snp_dev->layout = layout; + + /* Allocate the shared page used for the request and response message. */ + snp_dev->request = alloc_shared_pages(sizeof(struct snp_guest_msg)); + if (!snp_dev->request) + goto e_unmap; + + snp_dev->response = alloc_shared_pages(sizeof(struct snp_guest_msg)); + if (!snp_dev->response) + goto e_free_request; + + ret = -EIO; + snp_dev->crypto = init_crypto(snp_dev, snp_dev->vmpck, VMPCK_KEY_LEN); + if (!snp_dev->crypto) + goto e_free_response; + + misc = &snp_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = DEVICE_NAME; + misc->fops = &snp_guest_fops; + + /* initial the input address for guest request */ + snp_dev->input.req_gpa = __pa(snp_dev->request); + snp_dev->input.resp_gpa = __pa(snp_dev->response); + + ret = misc_register(misc); + if (ret) + goto e_free_response; + + dev_info(dev, "Initialized SNP guest driver (using vmpck_id %d)\n", vmpck_id); + return 0; + +e_free_response: + free_shared_pages(snp_dev->response, sizeof(struct snp_guest_msg)); +e_free_request: + free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); +e_unmap: + iounmap(layout); + return ret; +} + +static int __exit snp_guest_remove(struct platform_device *pdev) +{ + struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); + + free_shared_pages(snp_dev->response, sizeof(struct snp_guest_msg)); + free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); + deinit_crypto(snp_dev->crypto); + misc_deregister(&snp_dev->misc); + + return 0; +} + +static struct platform_driver snp_guest_driver = { + .remove = __exit_p(snp_guest_remove), + .driver = { + .name = "snp-guest", + }, +}; + +module_platform_driver_probe(snp_guest_driver, snp_guest_probe); + +MODULE_AUTHOR("Brijesh Singh "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0.0"); +MODULE_DESCRIPTION("AMD SNP Guest Driver"); diff --git a/drivers/virt/coco/sevguest/sevguest.h b/drivers/virt/coco/sevguest/sevguest.h new file mode 100644 index 000000000000..d39bdd013765 --- /dev/null +++ b/drivers/virt/coco/sevguest/sevguest.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Advanced Micro Devices, Inc. 
+ * + * Author: Brijesh Singh + * + * SEV-SNP API spec is available at https://developer.amd.com/sev + */ + +#ifndef __VIRT_SEVGUEST_H__ +#define __VIRT_SEVGUEST_H__ + +#include + +#define MAX_AUTHTAG_LEN 32 + +/* See SNP spec SNP_GUEST_REQUEST section for the structure */ +enum msg_type { + SNP_MSG_TYPE_INVALID = 0, + SNP_MSG_CPUID_REQ, + SNP_MSG_CPUID_RSP, + SNP_MSG_KEY_REQ, + SNP_MSG_KEY_RSP, + SNP_MSG_REPORT_REQ, + SNP_MSG_REPORT_RSP, + SNP_MSG_EXPORT_REQ, + SNP_MSG_EXPORT_RSP, + SNP_MSG_IMPORT_REQ, + SNP_MSG_IMPORT_RSP, + SNP_MSG_ABSORB_REQ, + SNP_MSG_ABSORB_RSP, + SNP_MSG_VMRK_REQ, + SNP_MSG_VMRK_RSP, + + SNP_MSG_TYPE_MAX +}; + +enum aead_algo { + SNP_AEAD_INVALID, + SNP_AEAD_AES_256_GCM, +}; + +struct snp_guest_msg_hdr { + u8 authtag[MAX_AUTHTAG_LEN]; + u64 msg_seqno; + u8 rsvd1[8]; + u8 algo; + u8 hdr_version; + u16 hdr_sz; + u8 msg_type; + u8 msg_version; + u16 msg_sz; + u32 rsvd2; + u8 msg_vmpck; + u8 rsvd3[35]; +} __packed; + +struct snp_guest_msg { + struct snp_guest_msg_hdr hdr; + u8 payload[4000]; +} __packed; + +/* + * The secrets page contains 96-bytes of reserved field that can be used by + * the guest OS. The guest OS uses the area to save the message sequence + * number for each VMPCK. + * + * See the GHCB spec section Secret page layout for the format for this area. + */ +struct secrets_os_area { + u32 msg_seqno_0; + u32 msg_seqno_1; + u32 msg_seqno_2; + u32 msg_seqno_3; + u64 ap_jump_table_pa; + u8 rsvd[40]; + u8 guest_usage[32]; +} __packed; + +#define VMPCK_KEY_LEN 32 + +/* See the SNP spec version 0.9 for secrets page format */ +struct snp_secrets_page_layout { + u32 version; + u32 imien : 1, + rsvd1 : 31; + u32 fms; + u32 rsvd2; + u8 gosvw[16]; + u8 vmpck0[VMPCK_KEY_LEN]; + u8 vmpck1[VMPCK_KEY_LEN]; + u8 vmpck2[VMPCK_KEY_LEN]; + u8 vmpck3[VMPCK_KEY_LEN]; + struct secrets_os_area os_area; + u8 rsvd3[3840]; +} __packed; + +#endif /* __VIRT_SEVGUEST_H__ */ diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h new file mode 100644 index 000000000000..38f11d723c68 --- /dev/null +++ b/include/uapi/linux/sev-guest.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD SEV and SNP guest driver. + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. 
+ *
+ * Author: Brijesh Singh
+ *
+ * SEV API specification is available at: https://developer.amd.com/sev/
+ */
+
+#ifndef __UAPI_LINUX_SEV_GUEST_H_
+#define __UAPI_LINUX_SEV_GUEST_H_
+
+#include <linux/types.h>
+
+struct snp_report_req {
+	/* user data that should be included in the report */
+	__u8 user_data[64];
+
+	/* The vmpl level to be included in the report */
+	__u32 vmpl;
+
+	/* Must be zero filled */
+	__u8 rsvd[28];
+};
+
+struct snp_report_resp {
+	/* response data, see SEV-SNP spec for the format */
+	__u8 data[4000];
+};
+
+struct snp_guest_request_ioctl {
+	/* message version number (must be non-zero) */
+	__u8 msg_version;
+
+	/* Request and response structure address */
+	__u64 req_data;
+	__u64 resp_data;
+
+	/* firmware error code on failure (see psp-sev.h) */
+	__u64 fw_err;
+};
+
+#define SNP_GUEST_REQ_IOC_TYPE	'S'
+
+/* Get SNP attestation report */
+#define SNP_GET_REPORT _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x0, struct snp_guest_request_ioctl)
+
+#endif /* __UAPI_LINUX_SEV_GUEST_H_ */
-- 
cgit v1.2.3


From 68de0b2f938642079c0c853b219bdb88c4dc4d13 Mon Sep 17 00:00:00 2001
From: Brijesh Singh
Date: Thu, 24 Feb 2022 10:56:23 -0600
Subject: virt: sevguest: Add support to derive key

The SNP_GET_DERIVED_KEY ioctl interface can be used by the SNP guest to
ask the firmware to provide a key derived from a root key. The derived
key may be used by the guest for any purpose it chooses, such as a
sealing key or communicating with external entities.

See the SEV-SNP firmware spec for more information.

[ bp: No need to memset "req" - it will get overwritten. ]

Signed-off-by: Brijesh Singh
Signed-off-by: Borislav Petkov
Reviewed-by: Liam Merwick
Link: https://lore.kernel.org/r/20220307213356.2797205-45-brijesh.singh@amd.com
---
 Documentation/virt/coco/sevguest.rst  | 17 +++++++++++++
 drivers/virt/coco/sevguest/sevguest.c | 45 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/sev-guest.h        | 17 +++++++++++++
 3 files changed, 79 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/coco/sevguest.rst b/Documentation/virt/coco/sevguest.rst
index 3da782e867a3..4135c1431241 100644
--- a/Documentation/virt/coco/sevguest.rst
+++ b/Documentation/virt/coco/sevguest.rst
@@ -77,6 +77,23 @@ On success, snp_report_resp.data will contain the report. The report is in
 the format described in the SEV-SNP specification. See the SEV-SNP
 specification for further details.
 
+2.2 SNP_GET_DERIVED_KEY
+-----------------------
+:Technology: sev-snp
+:Type: guest ioctl
+:Parameters (in): struct snp_derived_key_req
+:Returns (out): struct snp_derived_key_resp on success, -negative on error
+
+The SNP_GET_DERIVED_KEY ioctl can be used to get a key derived from a root key.
+The derived key can be used by the guest for any purpose, such as sealing keys
+or communicating with external entities.
+
+The ioctl uses the SNP_GUEST_REQUEST (MSG_KEY_REQ) command provided by the
+SEV-SNP firmware to derive the key. See the SEV-SNP specification for further
+details on the various fields passed in the key derivation request.
+
+On success, the snp_derived_key_resp.data contains the derived key value. See
+the SEV-SNP specification for further details.
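To make the calling convention concrete, here is a minimal userspace sketch
that requests a derived key through the misc device the driver registers
(/dev/sev-guest). This is only an illustration: error handling is trimmed,
and the all-zero request fields (root key selection, guest field select and
so on) are placeholders rather than recommended values::

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <linux/sev-guest.h>

        int main(void)
        {
                struct snp_derived_key_req req = {0};
                struct snp_derived_key_resp resp = {0};
                struct snp_guest_request_ioctl arg = {0};
                int fd;

                fd = open("/dev/sev-guest", O_RDWR);
                if (fd < 0)
                        return 1;

                arg.msg_version = 1;    /* must be non-zero */
                arg.req_data = (unsigned long)&req;
                arg.resp_data = (unsigned long)&resp;

                if (ioctl(fd, SNP_GET_DERIVED_KEY, &arg) < 0) {
                        /* on failure, fw_err holds the firmware error code */
                        fprintf(stderr, "fw_err %#llx\n", arg.fw_err);
                        return 1;
                }

                /* resp.data now holds the 64-byte derived key */
                return 0;
        }

The same pattern applies to SNP_GET_REPORT, with struct snp_report_req and
struct snp_report_resp in place of the key structures.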
Reference --------- diff --git a/drivers/virt/coco/sevguest/sevguest.c b/drivers/virt/coco/sevguest/sevguest.c index beda93cdeb4f..393777b72e5e 100644 --- a/drivers/virt/coco/sevguest/sevguest.c +++ b/drivers/virt/coco/sevguest/sevguest.c @@ -391,6 +391,48 @@ e_free: return rc; } +static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) +{ + struct snp_guest_crypto *crypto = snp_dev->crypto; + struct snp_derived_key_resp resp = {0}; + struct snp_derived_key_req req; + int rc, resp_len; + /* Response data is 64 bytes and max authsize for GCM is 16 bytes. */ + u8 buf[64 + 16]; + + lockdep_assert_held(&snp_cmd_mutex); + + if (!arg->req_data || !arg->resp_data) + return -EINVAL; + + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover the + * authtag. + */ + resp_len = sizeof(resp.data) + crypto->a_len; + if (sizeof(buf) < resp_len) + return -ENOMEM; + + if (copy_from_user(&req, (void __user *)arg->req_data, sizeof(req))) + return -EFAULT; + + rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version, + SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len, + &arg->fw_err); + if (rc) + return rc; + + memcpy(resp.data, buf, sizeof(resp.data)); + if (copy_to_user((void __user *)arg->resp_data, &resp, sizeof(resp))) + rc = -EFAULT; + + /* The response buffer contains the sensitive data, explicitly clear it. */ + memzero_explicit(buf, sizeof(buf)); + memzero_explicit(&resp, sizeof(resp)); + return rc; +} + static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { struct snp_guest_dev *snp_dev = to_snp_dev(file); @@ -420,6 +462,9 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long case SNP_GET_REPORT: ret = get_report(snp_dev, &input); break; + case SNP_GET_DERIVED_KEY: + ret = get_derived_key(snp_dev, &input); + break; default: break; } diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 38f11d723c68..598367f12064 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -30,6 +30,20 @@ struct snp_report_resp { __u8 data[4000]; }; +struct snp_derived_key_req { + __u32 root_key_select; + __u32 rsvd; + __u64 guest_field_select; + __u32 vmpl; + __u32 guest_svn; + __u64 tcb_version; +}; + +struct snp_derived_key_resp { + /* response data, see SEV-SNP spec for the format */ + __u8 data[64]; +}; + struct snp_guest_request_ioctl { /* message version number (must be non-zero) */ __u8 msg_version; @@ -47,4 +61,7 @@ struct snp_guest_request_ioctl { /* Get SNP attestation report */ #define SNP_GET_REPORT _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x0, struct snp_guest_request_ioctl) +/* Get a derived key from the root */ +#define SNP_GET_DERIVED_KEY _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x1, struct snp_guest_request_ioctl) + #endif /* __UAPI_LINUX_SEV_GUEST_H_ */ -- cgit v1.2.3 From d80b494f712317493d464a55652698c4d1b7bb0f Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:55 -0600 Subject: virt: sevguest: Add support to get extended report Version 2 of GHCB specification defines Non-Automatic-Exit (NAE) to get extended guest report which is similar to the SNP_GET_REPORT ioctl. The main difference is related to the additional data that will be returned. That additional data returned is a certificate blob that can be used by the SNP guest user. The certificate blob layout is defined in the GHCB specification. 
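As a rough sketch of what this looks like from the guest's side (reusing the
includes and file descriptor from the derived-key example above, plus
<stdlib.h>; the buffer size and retry policy are the caller's choice, the
interface only requires the length to be a multiple of the page size)::

        struct snp_ext_report_req req = {0};
        struct snp_report_resp resp = {0};
        struct snp_guest_request_ioctl arg = {0};
        size_t len = 4 * 4096;                  /* multiple of the page size */
        void *certs = aligned_alloc(4096, len);

        req.certs_address = (unsigned long)certs;
        req.certs_len = len;
        arg.msg_version = 1;
        arg.req_data = (unsigned long)&req;
        arg.resp_data = (unsigned long)&resp;

        if (ioctl(fd, SNP_GET_EXT_REPORT, &arg) < 0) {
                /*
                 * If the buffer was too small, the driver writes the
                 * expected length back into req.certs_len; reallocate
                 * and retry.
                 */
        }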
The driver simply treats the blob as opaque data and copies it to
userspace.

[ bp: Massage commit message, cast 1st arg of access_ok() ]

Signed-off-by: Brijesh Singh
Signed-off-by: Borislav Petkov
Link: https://lore.kernel.org/r/20220307213356.2797205-46-brijesh.singh@amd.com
---
 Documentation/virt/coco/sevguest.rst  | 23 +++++++++
 drivers/virt/coco/sevguest/sevguest.c | 92 ++++++++++++++++++++++++++++++++++-
 include/uapi/linux/sev-guest.h        | 13 +++++
 3 files changed, 126 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/coco/sevguest.rst b/Documentation/virt/coco/sevguest.rst
index 4135c1431241..625de22658ec 100644
--- a/Documentation/virt/coco/sevguest.rst
+++ b/Documentation/virt/coco/sevguest.rst
@@ -95,6 +95,29 @@ on the various fields passed in the key derivation request.
 On success, the snp_derived_key_resp.data contains the derived key value. See
 the SEV-SNP specification for further details.
 
+
+2.3 SNP_GET_EXT_REPORT
+----------------------
+:Technology: sev-snp
+:Type: guest ioctl
+:Parameters (in/out): struct snp_ext_report_req
+:Returns (out): struct snp_report_resp on success, -negative on error
+
+The SNP_GET_EXT_REPORT ioctl is similar to SNP_GET_REPORT. The difference is
+related to the additional certificate data that is returned with the report.
+The certificate data returned is provided by the hypervisor through the
+SNP_SET_EXT_CONFIG.
+
+The ioctl uses the SNP_GUEST_REQUEST (MSG_REPORT_REQ) command provided by the SEV-SNP
+firmware to get the attestation report.
+
+On success, snp_report_resp.data will contain the attestation report
+and snp_ext_report_req.certs_address will contain the certificate blob. If the
+length of the blob is smaller than expected then snp_ext_report_req.certs_len will
+be updated with the expected value.
+
+See the GHCB specification for further details on how to parse the certificate blob.
+
 Reference
 ---------
 
diff --git a/drivers/virt/coco/sevguest/sevguest.c b/drivers/virt/coco/sevguest/sevguest.c
index 393777b72e5e..15afb6ce8d19 100644
--- a/drivers/virt/coco/sevguest/sevguest.c
+++ b/drivers/virt/coco/sevguest/sevguest.c
@@ -43,6 +43,7 @@ struct snp_guest_dev {
 	struct device *dev;
 	struct miscdevice misc;
 
+	void *certs_data;
 	struct snp_guest_crypto *crypto;
 	struct snp_guest_msg *request, *response;
 	struct snp_secrets_page_layout *layout;
@@ -433,6 +434,82 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque
 	return rc;
 }
 
+static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg)
+{
+	struct snp_guest_crypto *crypto = snp_dev->crypto;
+	struct snp_ext_report_req req;
+	struct snp_report_resp *resp;
+	int ret, npages = 0, resp_len;
+
+	lockdep_assert_held(&snp_cmd_mutex);
+
+	if (!arg->req_data || !arg->resp_data)
+		return -EINVAL;
+
+	if (copy_from_user(&req, (void __user *)arg->req_data, sizeof(req)))
+		return -EFAULT;
+
+	/* userspace does not want certificate data */
+	if (!req.certs_len || !req.certs_address)
+		goto cmd;
+
+	if (req.certs_len > SEV_FW_BLOB_MAX_SIZE ||
+	    !IS_ALIGNED(req.certs_len, PAGE_SIZE))
+		return -EINVAL;
+
+	if (!access_ok((const void __user *)req.certs_address, req.certs_len))
+		return -EFAULT;
+
+	/*
+	 * Initialize the intermediate buffer with all zeros. This buffer
+	 * is used in the guest request message to get the certs blob from
+	 * the host. If host does not supply any certs in it, then copy
+	 * zeros to indicate that certificate data was not provided.
+ */ + memset(snp_dev->certs_data, 0, req.certs_len); + npages = req.certs_len >> PAGE_SHIFT; +cmd: + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover the + * authtag. + */ + resp_len = sizeof(resp->data) + crypto->a_len; + resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); + if (!resp) + return -ENOMEM; + + snp_dev->input.data_npages = npages; + ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg->msg_version, + SNP_MSG_REPORT_REQ, &req.data, + sizeof(req.data), resp->data, resp_len, &arg->fw_err); + + /* If certs length is invalid then copy the returned length */ + if (arg->fw_err == SNP_GUEST_REQ_INVALID_LEN) { + req.certs_len = snp_dev->input.data_npages << PAGE_SHIFT; + + if (copy_to_user((void __user *)arg->req_data, &req, sizeof(req))) + ret = -EFAULT; + } + + if (ret) + goto e_free; + + if (npages && + copy_to_user((void __user *)req.certs_address, snp_dev->certs_data, + req.certs_len)) { + ret = -EFAULT; + goto e_free; + } + + if (copy_to_user((void __user *)arg->resp_data, resp, sizeof(*resp))) + ret = -EFAULT; + +e_free: + kfree(resp); + return ret; +} + static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) { struct snp_guest_dev *snp_dev = to_snp_dev(file); @@ -465,6 +542,9 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long case SNP_GET_DERIVED_KEY: ret = get_derived_key(snp_dev, &input); break; + case SNP_GET_EXT_REPORT: + ret = get_ext_report(snp_dev, &input); + break; default: break; } @@ -595,10 +675,14 @@ static int __init snp_guest_probe(struct platform_device *pdev) if (!snp_dev->response) goto e_free_request; + snp_dev->certs_data = alloc_shared_pages(SEV_FW_BLOB_MAX_SIZE); + if (!snp_dev->certs_data) + goto e_free_response; + ret = -EIO; snp_dev->crypto = init_crypto(snp_dev, snp_dev->vmpck, VMPCK_KEY_LEN); if (!snp_dev->crypto) - goto e_free_response; + goto e_free_cert_data; misc = &snp_dev->misc; misc->minor = MISC_DYNAMIC_MINOR; @@ -608,14 +692,17 @@ static int __init snp_guest_probe(struct platform_device *pdev) /* initial the input address for guest request */ snp_dev->input.req_gpa = __pa(snp_dev->request); snp_dev->input.resp_gpa = __pa(snp_dev->response); + snp_dev->input.data_gpa = __pa(snp_dev->certs_data); ret = misc_register(misc); if (ret) - goto e_free_response; + goto e_free_cert_data; dev_info(dev, "Initialized SNP guest driver (using vmpck_id %d)\n", vmpck_id); return 0; +e_free_cert_data: + free_shared_pages(snp_dev->certs_data, SEV_FW_BLOB_MAX_SIZE); e_free_response: free_shared_pages(snp_dev->response, sizeof(struct snp_guest_msg)); e_free_request: @@ -629,6 +716,7 @@ static int __exit snp_guest_remove(struct platform_device *pdev) { struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); + free_shared_pages(snp_dev->certs_data, SEV_FW_BLOB_MAX_SIZE); free_shared_pages(snp_dev->response, sizeof(struct snp_guest_msg)); free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); deinit_crypto(snp_dev->crypto); diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h index 598367f12064..256aaeff7e65 100644 --- a/include/uapi/linux/sev-guest.h +++ b/include/uapi/linux/sev-guest.h @@ -56,6 +56,16 @@ struct snp_guest_request_ioctl { __u64 fw_err; }; +struct snp_ext_report_req { + struct snp_report_req data; + + /* where to copy the certificate blob */ + __u64 certs_address; + + /* length of the certificate blob */ + __u32 certs_len; +}; + #define 
SNP_GUEST_REQ_IOC_TYPE 'S' /* Get SNP attestation report */ @@ -64,4 +74,7 @@ struct snp_guest_request_ioctl { /* Get a derived key from the root */ #define SNP_GET_DERIVED_KEY _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x1, struct snp_guest_request_ioctl) +/* Get SNP extended report as defined in the GHCB specification version 2. */ +#define SNP_GET_EXT_REPORT _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x2, struct snp_guest_request_ioctl) + #endif /* __UAPI_LINUX_SEV_GUEST_H_ */ -- cgit v1.2.3 From 794c24e9921f32ded4422833a990ccf11dc3c00e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 6 Apr 2022 17:26:00 +0000 Subject: net-core: rx_otherhost_dropped to core_stats Increment rx_otherhost_dropped counter when packet dropped due to mismatched dest MAC addr. An example when this drop can occur is when manually crafting raw packets that will be consumed by a user space application via a tap device. For testing purposes local traffic was generated using trafgen for the client and netcat to start a server Tested: Created 2 netns, sent 1 packet using trafgen from 1 to the other with "{eth(daddr=$INCORRECT_MAC...}", verified that iproute2 showed the counter was incremented. (Also had to modify iproute2 to show the stat, additional patch for that coming next.) Signed-off-by: Jeffrey Ji Reviewed-by: Brian Vazquez Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406172600.1141083-1-jeffreyjilinux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ include/uapi/linux/if_link.h | 5 +++++ net/core/dev.c | 1 + net/ipv4/ip_input.c | 1 + net/ipv6/ip6_input.c | 1 + 5 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e7b2a72e473..28ea4f8269d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -203,6 +203,7 @@ struct net_device_core_stats { local_t rx_dropped; local_t tx_dropped; local_t rx_nohandler; + local_t rx_otherhost_dropped; } __aligned(4 * sizeof(local_t)); #include @@ -3837,6 +3838,7 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) +DEV_CORE_STATS_INC(rx_otherhost_dropped) static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cc284c048e69..d1e600816b82 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -211,6 +211,9 @@ struct rtnl_link_stats { * @rx_nohandler: Number of packets received on the interface * but dropped by the networking stack because the device is * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. */ struct rtnl_link_stats64 { __u64 rx_packets; @@ -243,6 +246,8 @@ struct rtnl_link_stats64 { __u64 rx_compressed; __u64 tx_compressed; __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; }; /* Subset of link stats useful for in-HW collection. 
Meaning of the fields is as diff --git a/net/core/dev.c b/net/core/dev.c index f00d29856b43..e027410e861b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10358,6 +10358,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, storage->rx_dropped += local_read(&core_stats->rx_dropped); storage->tx_dropped += local_read(&core_stats->tx_dropped); storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + storage->rx_otherhost_dropped += local_read(&core_stats->rx_otherhost_dropped); } } return storage; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 95f7bb052784..b1165f717cd1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -451,6 +451,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 5b5ea35635f9..b4880c7c84eb 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -150,6 +150,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, struct inet6_dev *idev; if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); kfree_skb(skb); return NULL; } -- cgit v1.2.3 From c4212f3eb89fd5654f0a6ed2ee1d13fcb86cb664 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 10 Apr 2022 15:13:24 -0600 Subject: io_uring: flag the fact that linked file assignment is sane Give applications a way to tell if the kernel supports sane linked files, as in files being assigned at the right time to be able to reliably do while using IOSQE_IO_LINK to order them. Not really a bug fix, but flag it as such so that it gets pulled in with backports of the deferred file assignment. Fixes: 6bf9c47a3989 ("io_uring: defer file assignment") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- include/uapi/linux/io_uring.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 659f8ecba5b7..f060ad018ba4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11178,7 +11178,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | + IORING_FEAT_LINKED_FILE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 784adc6f6ed2..1845cf7c80ba 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -296,6 +296,7 @@ struct io_uring_params { #define IORING_FEAT_NATIVE_WORKERS (1U << 9) #define IORING_FEAT_RSRC_TAGS (1U << 10) #define IORING_FEAT_CQE_SKIP (1U << 11) +#define IORING_FEAT_LINKED_FILE (1U << 12) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 2d7991fe867974a8e5065ee9691451a406b9320d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 11 Mar 2022 16:23:22 -0700 Subject: dmaengine: idxd: update IAA definitions for user header Add additional structure definitions for Intel In-memory Analytics Accelerator (IAA/IAX). See specification (1) for more details. 
1: https://cdrdv2.intel.com/v1/dl/getContent/721858 Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/164704100212.1373038.18362680016033557757.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index a8f0ff75c430..bce7c43657d5 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -53,6 +53,11 @@ enum idxd_scmd_stat { /* IAX */ #define IDXD_OP_FLAG_RD_SRC2_AECS 0x010000 +#define IDXD_OP_FLAG_RD_SRC2_2ND 0x020000 +#define IDXD_OP_FLAG_WR_SRC2_AECS_COMP 0x040000 +#define IDXD_OP_FLAG_WR_SRC2_AECS_OVFL 0x080000 +#define IDXD_OP_FLAG_SRC2_STS 0x100000 +#define IDXD_OP_FLAG_CRC_RFC3720 0x200000 /* Opcode */ enum dsa_opcode { @@ -81,6 +86,18 @@ enum iax_opcode { IAX_OPCODE_MEMMOVE, IAX_OPCODE_DECOMPRESS = 0x42, IAX_OPCODE_COMPRESS, + IAX_OPCODE_CRC64, + IAX_OPCODE_ZERO_DECOMP_32 = 0x48, + IAX_OPCODE_ZERO_DECOMP_16, + IAX_OPCODE_DECOMP_32 = 0x4c, + IAX_OPCODE_DECOMP_16, + IAX_OPCODE_SCAN = 0x50, + IAX_OPCODE_SET_MEMBER, + IAX_OPCODE_EXTRACT, + IAX_OPCODE_SELECT, + IAX_OPCODE_RLE_BURST, + IAX_OPCDE_FIND_UNIQUE, + IAX_OPCODE_EXPAND, }; /* Completion record status */ @@ -120,6 +137,7 @@ enum iax_completion_status { IAX_COMP_NONE = 0, IAX_COMP_SUCCESS, IAX_COMP_PAGE_FAULT_IR = 0x04, + IAX_COMP_ANALYTICS_ERROR = 0x0a, IAX_COMP_OUTBUF_OVERFLOW, IAX_COMP_BAD_OPCODE = 0x10, IAX_COMP_INVALID_FLAGS, @@ -140,7 +158,10 @@ enum iax_completion_status { IAX_COMP_WATCHDOG, IAX_COMP_INVALID_COMP_FLAG = 0x30, IAX_COMP_INVALID_FILTER_FLAG, - IAX_COMP_INVALID_NUM_ELEMS = 0x33, + IAX_COMP_INVALID_INPUT_SIZE, + IAX_COMP_INVALID_NUM_ELEMS, + IAX_COMP_INVALID_SRC1_WIDTH, + IAX_COMP_INVALID_INVERT_OUT, }; #define DSA_COMP_STATUS_MASK 0x7f @@ -319,8 +340,12 @@ struct iax_completion_record { uint32_t output_size; uint8_t output_bits; uint8_t rsvd3; - uint16_t rsvd4; - uint64_t rsvd5[4]; + uint16_t xor_csum; + uint32_t crc; + uint32_t min; + uint32_t max; + uint32_t sum; + uint64_t rsvd4[2]; } __attribute__((packed)); struct iax_raw_completion_record { -- cgit v1.2.3 From 545528d788556c724eeb5400757f828ef27782a8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:54 +0300 Subject: net: netlink: add NLM_F_BULK delete request modifier Add a new delete request modifier called NLM_F_BULK which, when supported, would cause the request to delete multiple objects. The flag is a convenient way to signal that a multiple delete operation is requested which can be gradually added to different delete requests. In order to make sure older kernels will error out if the operation is not supported instead of doing something unintended we have to break a required condition when implementing support for this flag, f.e. for neighbors we will omit the mandatory mac address attribute. Initially it will be used to add flush with filtering support for bridge fdbs, but it also opens the door to add similar support to others. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 4c0cde075c27..855dffb4c1c3 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -72,6 +72,7 @@ struct nlmsghdr { /* Modifiers to DELETE request */ #define NLM_F_NONREC 0x100 /* Do not delete recursively */ +#define NLM_F_BULK 0x200 /* Delete multiple objects */ /* Flags for ACK message */ #define NLM_F_CAPPED 0x100 /* request was capped */ -- cgit v1.2.3 From ea2c0f9e3fc2f94f090d693b7235c02af1289629 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:52:00 +0300 Subject: net: rtnetlink: add ndm flags and state mask attributes Add ndm flags/state masks which will be used for bulk delete filtering. All of these are used by the bridge and vxlan drivers. Also minimal attr policy validation is added, it is up to ndo_fdb_del_bulk implementers to further validate them. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 2 ++ net/core/rtnetlink.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index db05fb55055e..39c565e460c7 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -32,6 +32,8 @@ enum { NDA_NH_ID, NDA_FDB_EXT_ATTRS, NDA_FLAGS_EXT, + NDA_NDM_STATE_MASK, + NDA_NDM_FLAGS_MASK, __NDA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 520d50fcaaea..ab7fb9a16da9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4172,6 +4172,8 @@ EXPORT_SYMBOL(ndo_dflt_fdb_del); static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = { [NDA_VLAN] = { .type = NLA_U16 }, [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, + [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, }; static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From c24a950ec7d60c4da91dc3f273295c7f438b531e Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Thu, 7 Apr 2022 14:02:33 -0700 Subject: KVM, SEV: Add KVM_EXIT_SHUTDOWN metadata for SEV-ES If an SEV-ES guest requests termination, exit to userspace with KVM_EXIT_SYSTEM_EVENT and a dedicated SEV_TERM type instead of -EINVAL so that userspace can take appropriate action. See AMD's GHCB spec section '4.1.13 Termination Request' for more details. Suggested-by: Sean Christopherson Suggested-by: Paolo Bonzini Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Gonda Reported-by: kernel test robot Message-Id: <20220407210233.782250-1-pgonda@google.com> [Add documentatino. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 12 +++++++++++- arch/x86/kvm/svm/sev.c | 9 +++++++-- include/uapi/linux/kvm.h | 5 ++++- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index e7a0dfdc0178..72183ae628f7 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6088,8 +6088,12 @@ should put the acknowledged interrupt vector into the 'epr' field. 
#define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 + #define KVM_SYSTEM_EVENT_SEV_TERM 4 + #define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31) __u32 type; + __u32 ndata; __u64 flags; + __u64 data[16]; } system_event; If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered @@ -6099,7 +6103,7 @@ HVC instruction based PSCI call from the vcpu. The 'type' field describes the system-level event type. The 'flags' field describes architecture specific flags for the system-level event. -Valid values for 'type' are: +Valid values for bits 30:0 of 'type' are: - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the VM. Userspace is not obliged to honour this, and if it does honour @@ -6112,12 +6116,18 @@ Valid values for 'type' are: has requested a crash condition maintenance. Userspace can choose to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination. + The guest physical address of the guest's GHCB is stored in `data[0]`. Valid flags are: - KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI specification. +Extra data for this event is stored in the `data[]` array, up to index +`ndata-1` included, if bit 31 is set in `type`. The data depends on the +`type` field. There is no extra data if bit 31 is clear or `ndata` is zero. + :: /* KVM_EXIT_IOAPIC_EOI */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 537aaddc852f..a93f0d01bb90 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2738,8 +2738,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - ret = -EINVAL; - break; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM | + KVM_SYSTEM_EVENT_NDATA_VALID; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[1] = control->ghcb_gpa; + + return 0; } default: /* Error, keep GHCB MSR value as-is */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8616af85dc5d..dd1d8167e71f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -444,8 +444,11 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 +#define KVM_SYSTEM_EVENT_SEV_TERM 4 +#define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31) __u32 type; - __u64 flags; + __u32 ndata; + __u64 data[16]; } system_event; /* KVM_EXIT_S390_STSI */ struct { -- cgit v1.2.3 From 4dc84c06a343fcb95fd5a0acb537aefa4ebdd1b0 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Tue, 12 Apr 2022 10:01:19 +0800 Subject: net: ethtool: extend ringparam set/get APIs for tx_push Currently tx push is a standard driver feature which controls use of a fast path descriptor push. So this patch extends the ringparam APIs and data structures to support set/get tx push by ethtool -G/g. 
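Assuming an ethtool binary new enough to understand the new ring parameter
(the option spelling below is an assumption about the userspace tool, which
this kernel patch does not define), usage would look roughly like::

        # query ring parameters, including the current tx push state
        ethtool -g eth0

        # enable the descriptor fast path on a driver that advertises
        # ETHTOOL_RING_USE_TX_PUSH
        ethtool -G eth0 tx-push on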
Signed-off-by: Jie Wang Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 8 ++++++++ include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.h | 2 +- net/ethtool/rings.c | 18 ++++++++++++++++-- 5 files changed, 30 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 24d9be69065d..dbca3e9ec782 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -862,6 +862,7 @@ Kernel response contents: ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` u8 TCP header / data split ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE + ``ETHTOOL_A_RINGS_TX_PUSH`` u8 flag of TX Push mode ==================================== ====== =========================== ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with @@ -871,6 +872,12 @@ separate buffers. The device configuration must make it possible to receive full memory pages of data, for example because MTU is high enough or through HW-GRO. +``ETHTOOL_A_RINGS_TX_PUSH`` flag is used to enable descriptor fast +path to send packets. In ordinary path, driver fills descriptors in DRAM and +notifies NIC hardware. In fast path, driver pushes descriptors to the device +through MMIO writes, thus reducing the latency. However, enabling this feature +may increase the CPU cost. Drivers may enforce additional per-packet +eligibility checks (e.g. on packet size). RINGS_SET ========= @@ -887,6 +894,7 @@ Request contents: ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE + ``ETHTOOL_A_RINGS_TX_PUSH`` u8 flag of TX Push mode ==================================== ====== =========================== Kernel checks that requested ring sizes do not exceed limits reported by diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4af58459a1e7..99dc7bfbcd3c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -71,11 +71,13 @@ enum { * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. 
* @tcp_data_split: Scatter packet headers and data to separate buffers + * @tx_push: The flag of tx push mode * @cqe_size: Size of TX/RX completion queue event */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; + u8 tx_push; u32 cqe_size; }; @@ -83,10 +85,12 @@ struct kernel_ethtool_ringparam { * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size + * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), ETHTOOL_RING_USE_CQE_SIZE = BIT(1), + ETHTOOL_RING_USE_TX_PUSH = BIT(2), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 979850221b8d..d2fb4f7be61b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -338,6 +338,7 @@ enum { ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ + ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 29d01662a48b..7919ddb2371c 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -363,7 +363,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_CQE_SIZE + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 9f33c9689b56..9ed60c507d97 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -55,7 +55,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ - nla_total_size(sizeof(u32)); /* _RINGS_CQE_SIZE */ + nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ + nla_total_size(sizeof(u8))); /* _RINGS_TX_PUSH */ } static int rings_fill_reply(struct sk_buff *skb, @@ -94,7 +95,8 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && - (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size)))) + (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || + nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push)) return -EMSGSIZE; return 0; @@ -123,6 +125,7 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), }; int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) @@ -149,6 +152,15 @@ int 
ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) if (!ops->get_ringparam || !ops->set_ringparam) goto out_dev; + if (tb[ETHTOOL_A_RINGS_TX_PUSH] && + !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { + ret = -EOPNOTSUPP; + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_TX_PUSH], + "setting tx push not supported"); + goto out_dev; + } + rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) @@ -165,6 +177,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); + ethnl_update_u8(&kernel_ringparam.tx_push, + tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ret = 0; if (!mod) goto out_ops; -- cgit v1.2.3 From f9a2fb73318eb4dbf8cd84866b8b0dd012d8b116 Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Fri, 15 Apr 2022 08:34:02 +0000 Subject: net/ipv6: Introduce accept_unsolicited_na knob to implement router-side changes for RFC9131 Add a new neighbour cache entry in STALE state for routers on receiving an unsolicited (gratuitous) neighbour advertisement with target link-layer-address option specified. This is similar to the arp_accept configuration for IPv4. A new sysctl endpoint is created to turn on this behaviour: /proc/sys/net/ipv6/conf/interface/accept_unsolicited_na. Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 27 +++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 + net/ipv6/ndisc.c | 20 +- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/ndisc_unsolicited_na_test.sh | 255 +++++++++++++++++++++ 7 files changed, 314 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/ndisc_unsolicited_na_test.sh (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b0024aa7b051..433f2e4a5fed 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2467,6 +2467,33 @@ drop_unsolicited_na - BOOLEAN By default this is turned off. +accept_unsolicited_na - BOOLEAN + Add a new neighbour cache entry in STALE state for routers on receiving an + unsolicited neighbour advertisement with target link-layer address option + specified. This is as per router-side behavior documented in RFC9131. + This has lower precedence than drop_unsolicited_na. + + ==== ====== ====== ============================================== + drop accept fwding behaviour + ---- ------ ------ ---------------------------------------------- + 1 X X Drop NA packet and don't pass up the stack + 0 0 X Pass NA packet up the stack, don't update NC + 0 1 0 Pass NA packet up the stack, don't update NC + 0 1 1 Pass NA packet up the stack, and add a STALE + NC entry + ==== ====== ====== ============================================== + + This will optimize the return path for the initial off-link communication + that is initiated by a directly connected host, by ensuring that + the first-hop router which turns on this setting doesn't have to + buffer the initial return packets to do neighbour-solicitation. + The prerequisite is that the host is configured to send + unsolicited neighbour advertisements on interface bringup. + This setting should be used in conjunction with the ndisc_notify setting + on the host to satisfy this prerequisite. + + By default this is turned off. 
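For example, the row of the table above that yields a STALE neighbour cache
entry (drop=0, accept=1, forwarding=1) corresponds to a configuration along
these lines, with eth0 standing in for the real interfaces::

        # router side: forwarding on, accept unsolicited NAs
        sysctl -w net.ipv6.conf.eth0.forwarding=1
        sysctl -w net.ipv6.conf.eth0.accept_unsolicited_na=1

        # host side: send unsolicited NAs on interface bring-up
        sysctl -w net.ipv6.conf.eth0.ndisc_notify=1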
+ enhanced_dad - BOOLEAN Include a nonce option in the IPv6 neighbor solicitation messages used for duplicate address detection per RFC7527. A received DAD NS will only signal diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 16870f86c74d..918bfea4ef5f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -61,6 +61,7 @@ struct ipv6_devconf { __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; + __s32 accept_unsolicited_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index d4178dace0bf..549ddeaf788b 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -194,6 +194,7 @@ enum { DEVCONF_IOAM6_ID, DEVCONF_IOAM6_ID_WIDE, DEVCONF_NDISC_EVICT_NOCARRIER, + DEVCONF_ACCEPT_UNSOLICITED_NA, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1afc4c024981..6473dc84b71d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5587,6 +5587,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; + array[DEVCONF_ACCEPT_UNSOLICITED_NA] = cnf->accept_unsolicited_na; } static inline size_t inet6_ifla6_size(void) @@ -7037,6 +7038,15 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = (void *)SYSCTL_ZERO, .extra2 = (void *)SYSCTL_ONE, }, + { + .procname = "accept_unsolicited_na", + .data = &ipv6_devconf.accept_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index fcb288b0ae13..254addad0dd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -979,6 +979,7 @@ static void ndisc_recv_na(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; + bool create_neigh; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NA: packet too short\n"); @@ -999,6 +1000,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. + * drop_unsolicited_na takes precedence over accept_unsolicited_na */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) @@ -1039,7 +1041,23 @@ static void ndisc_recv_na(struct sk_buff *skb) in6_ifa_put(ifp); return; } - neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + /* RFC 9131 updates original Neighbour Discovery RFC 4861. + * An unsolicited NA can now create a neighbour cache entry + * on routers if it has Target LL Address option. + * + * drop accept fwding behaviour + * ---- ------ ------ ---------------------------------------------- + * 1 X X Drop NA packet and don't pass up the stack + * 0 0 X Pass NA packet up the stack, don't update NC + * 0 1 0 Pass NA packet up the stack, don't update NC + * 0 1 1 Pass NA packet up the stack, and add a STALE + * NC entry + * Note that we don't do a (daddr == all-routers-mcast) check. 
+ */ + create_neigh = !msg->icmph.icmp6_solicited && lladdr && + idev && idev->cnf.forwarding && + idev->cnf.accept_unsolicited_na; + neigh = __neigh_lookup(&nd_tbl, &msg->target, dev, create_neigh); if (neigh) { u8 old_flags = neigh->flags; diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3fe2515aa616..af7f6e6ff182 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -36,6 +36,7 @@ TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh +TEST_PROGS += ndisc_unsolicited_na_test.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest diff --git a/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh b/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh new file mode 100755 index 000000000000..f508657ee126 --- /dev/null +++ b/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh @@ -0,0 +1,255 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test is for the accept_unsolicited_na feature to +# enable RFC9131 behaviour. The following is the test-matrix. +# drop accept fwding behaviour +# ---- ------ ------ ---------------------------------------------- +# 1 X X Drop NA packet and don't pass up the stack +# 0 0 X Pass NA packet up the stack, don't update NC +# 0 1 0 Pass NA packet up the stack, don't update NC +# 0 1 1 Pass NA packet up the stack, and add a STALE +# NC entry + +ret=0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +PAUSE_ON_FAIL=no +PAUSE=no + +HOST_NS="ns-host" +ROUTER_NS="ns-router" + +HOST_INTF="veth-host" +ROUTER_INTF="veth-router" + +ROUTER_ADDR="2000:20::1" +HOST_ADDR="2000:20::2" +SUBNET_WIDTH=64 +ROUTER_ADDR_WITH_MASK="${ROUTER_ADDR}/${SUBNET_WIDTH}" +HOST_ADDR_WITH_MASK="${HOST_ADDR}/${SUBNET_WIDTH}" + +IP_HOST="ip -6 -netns ${HOST_NS}" +IP_HOST_EXEC="ip netns exec ${HOST_NS}" +IP_ROUTER="ip -6 -netns ${ROUTER_NS}" +IP_ROUTER_EXEC="ip netns exec ${ROUTER_NS}" + +tcpdump_stdout= +tcpdump_stderr= + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf " TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf " TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + if [ "${PAUSE}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi +} + +setup() +{ + set -e + + local drop_unsolicited_na=$1 + local accept_unsolicited_na=$2 + local forwarding=$3 + + # Setup two namespaces and a veth tunnel across them. + # On end of the tunnel is a router and the other end is a host. + ip netns add ${HOST_NS} + ip netns add ${ROUTER_NS} + ${IP_ROUTER} link add ${ROUTER_INTF} type veth \ + peer name ${HOST_INTF} netns ${HOST_NS} + + # Enable IPv6 on both router and host, and configure static addresses. + # The router here is the DUT + # Setup router configuration as specified by the arguments. + # forwarding=0 case is to check that a non-router + # doesn't add neighbour entries. 
+ ROUTER_CONF=net.ipv6.conf.${ROUTER_INTF} + ${IP_ROUTER_EXEC} sysctl -qw \ + ${ROUTER_CONF}.forwarding=${forwarding} + ${IP_ROUTER_EXEC} sysctl -qw \ + ${ROUTER_CONF}.drop_unsolicited_na=${drop_unsolicited_na} + ${IP_ROUTER_EXEC} sysctl -qw \ + ${ROUTER_CONF}.accept_unsolicited_na=${accept_unsolicited_na} + ${IP_ROUTER_EXEC} sysctl -qw ${ROUTER_CONF}.disable_ipv6=0 + ${IP_ROUTER} addr add ${ROUTER_ADDR_WITH_MASK} dev ${ROUTER_INTF} + + # Turn on ndisc_notify on host interface so that + # the host sends unsolicited NAs. + HOST_CONF=net.ipv6.conf.${HOST_INTF} + ${IP_HOST_EXEC} sysctl -qw ${HOST_CONF}.ndisc_notify=1 + ${IP_HOST_EXEC} sysctl -qw ${HOST_CONF}.disable_ipv6=0 + ${IP_HOST} addr add ${HOST_ADDR_WITH_MASK} dev ${HOST_INTF} + + set +e +} + +start_tcpdump() { + set -e + tcpdump_stdout=`mktemp` + tcpdump_stderr=`mktemp` + ${IP_ROUTER_EXEC} timeout 15s \ + tcpdump --immediate-mode -tpni ${ROUTER_INTF} -c 1 \ + "icmp6 && icmp6[0] == 136 && src ${HOST_ADDR}" \ + > ${tcpdump_stdout} 2> /dev/null + set +e +} + +cleanup_tcpdump() +{ + set -e + [[ ! -z ${tcpdump_stdout} ]] && rm -f ${tcpdump_stdout} + [[ ! -z ${tcpdump_stderr} ]] && rm -f ${tcpdump_stderr} + tcpdump_stdout= + tcpdump_stderr= + set +e +} + +cleanup() +{ + cleanup_tcpdump + ip netns del ${HOST_NS} + ip netns del ${ROUTER_NS} +} + +link_up() { + set -e + ${IP_ROUTER} link set dev ${ROUTER_INTF} up + ${IP_HOST} link set dev ${HOST_INTF} up + set +e +} + +verify_ndisc() { + local drop_unsolicited_na=$1 + local accept_unsolicited_na=$2 + local forwarding=$3 + + neigh_show_output=$(${IP_ROUTER} neigh show \ + to ${HOST_ADDR} dev ${ROUTER_INTF} nud stale) + if [ ${drop_unsolicited_na} -eq 0 ] && \ + [ ${accept_unsolicited_na} -eq 1 ] && \ + [ ${forwarding} -eq 1 ]; then + # Neighbour entry expected to be present for 011 case + [[ ${neigh_show_output} ]] + else + # Neighbour entry expected to be absent for all other cases + [[ -z ${neigh_show_output} ]] + fi +} + +test_unsolicited_na_common() +{ + # Setup the test bed, but keep links down + setup $1 $2 $3 + + # Bring the link up, wait for the NA, + # and add a delay to ensure neighbour processing is done. + link_up + start_tcpdump + + # Verify the neighbour table + verify_ndisc $1 $2 $3 + +} + +test_unsolicited_na_combination() { + test_unsolicited_na_common $1 $2 $3 + test_msg=("test_unsolicited_na: " + "drop_unsolicited_na=$1 " + "accept_unsolicited_na=$2 " + "forwarding=$3") + log_test $? 0 "${test_msg[*]}" + cleanup +} + +test_unsolicited_na_combinations() { + # Args: drop_unsolicited_na accept_unsolicited_na forwarding + + # Expect entry + test_unsolicited_na_combination 0 1 1 + + # Expect no entry + test_unsolicited_na_combination 0 0 0 + test_unsolicited_na_combination 0 0 1 + test_unsolicited_na_combination 0 1 0 + test_unsolicited_na_combination 1 0 0 + test_unsolicited_na_combination 1 0 1 + test_unsolicited_na_combination 1 1 0 + test_unsolicited_na_combination 1 1 1 +} + +############################################################################### +# usage + +usage() +{ + cat < /dev/null + +test_unsolicited_na_combinations + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret -- cgit v1.2.3 From 470776c6b03491a3e82c644737a6da5466b8b3eb Mon Sep 17 00:00:00 2001 From: Shelby Heffron Date: Sun, 17 Apr 2022 13:05:08 -0700 Subject: Input: add Marine Navigation Keycodes Add keycodes that are used by marine navigation devices. 
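For illustration, a hedged sketch of how a driver would emit one of the new codes through the standard input API; the helper below is hypothetical and assumes the device was registered with input_set_capability(input, EV_KEY, KEY_SOS) at probe time:

  #include <linux/input.h>

  /* Report a single press/release of the new SOS shortcut key. */
  static void report_sos(struct input_dev *input)
  {
  	input_report_key(input, KEY_SOS, 1);
  	input_sync(input);
  	input_report_key(input, KEY_SOS, 0);
  	input_sync(input);
  }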
Signed-off-by: Shelby Heffron Link: https://lore.kernel.org/r/20220414015356.1619310-1-Shelby.Heffron@garmin.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 7989d9483ea7..dff8e7f17074 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -662,6 +662,27 @@ /* Select an area of screen to be copied */ #define KEY_SELECTIVE_SCREENSHOT 0x27a +/* Move the focus to the next or previous user controllable element within a UI container */ +#define KEY_NEXT_ELEMENT 0x27b +#define KEY_PREVIOUS_ELEMENT 0x27c + +/* Toggle Autopilot engagement */ +#define KEY_AUTOPILOT_ENGAGE_TOGGLE 0x27d + +/* Shortcut Keys */ +#define KEY_MARK_WAYPOINT 0x27e +#define KEY_SOS 0x27f +#define KEY_NAV_CHART 0x280 +#define KEY_FISHING_CHART 0x281 +#define KEY_SINGLE_RANGE_RADAR 0x282 +#define KEY_DUAL_RANGE_RADAR 0x283 +#define KEY_RADAR_OVERLAY 0x284 +#define KEY_TRADITIONAL_SONAR 0x285 +#define KEY_CLEARVU_SONAR 0x286 +#define KEY_SIDEVU_SONAR 0x287 +#define KEY_NAV_INFO 0x288 +#define KEY_BRIGHTNESS_MENU 0x289 + /* * Some keyboards have keys which do not have a defined meaning, these keys * are intended to be programmed / bound to macros by the user. For most -- cgit v1.2.3 From c246f9b5fd617fe487f8b6f18851703f468501d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:25 +0300 Subject: devlink: add support to create line card and expose to user Extend the devlink API so the driver is going to be able to create and destroy linecard instances. There can be multiple line cards per devlink device. Expose this new type of object over devlink netlink API to the userspace, with notifications. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 4 + include/uapi/linux/devlink.h | 7 ++ net/core/devlink.c | 270 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 280 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index a30180c0988a..7aabdfb3bc6a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -22,6 +22,7 @@ #include struct devlink; +struct devlink_linecard; struct devlink_port_phys_attrs { u32 port_number; /* Same value as "split group". 
@@ -1536,6 +1537,9 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, int devlink_rate_leaf_create(struct devlink_port *port, void *priv); void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); void devlink_rate_nodes_destroy(struct devlink *devlink); +struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, + unsigned int linecard_index); +void devlink_linecard_destroy(struct devlink_linecard *linecard); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b897b80770f6..59c33ed2d3e7 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -131,6 +131,11 @@ enum devlink_command { DEVLINK_CMD_RATE_NEW, DEVLINK_CMD_RATE_DEL, + DEVLINK_CMD_LINECARD_GET, /* can dump */ + DEVLINK_CMD_LINECARD_SET, + DEVLINK_CMD_LINECARD_NEW, + DEVLINK_CMD_LINECARD_DEL, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -553,6 +558,8 @@ enum devlink_attr { DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, /* u32 */ + DEVLINK_ATTR_LINECARD_INDEX, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index aeca13b6e57b..4cdacd74b82a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -54,6 +54,8 @@ struct devlink { struct list_head trap_list; struct list_head trap_group_list; struct list_head trap_policer_list; + struct list_head linecard_list; + struct mutex linecards_lock; /* protects linecard_list */ const struct devlink_ops *ops; u64 features; struct xarray snapshot_ids; @@ -70,6 +72,13 @@ struct devlink { char priv[] __aligned(NETDEV_ALIGN); }; +struct devlink_linecard { + struct list_head list; + struct devlink *devlink; + unsigned int index; + refcount_t refcount; +}; + /** * struct devlink_resource - devlink resource * @name: name of the resource @@ -397,6 +406,56 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static struct devlink_linecard * +devlink_linecard_get_by_index(struct devlink *devlink, + unsigned int linecard_index) +{ + struct devlink_linecard *devlink_linecard; + + list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { + if (devlink_linecard->index == linecard_index) + return devlink_linecard; + } + return NULL; +} + +static bool devlink_linecard_index_exists(struct devlink *devlink, + unsigned int linecard_index) +{ + return devlink_linecard_get_by_index(devlink, linecard_index); +} + +static struct devlink_linecard * +devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { + u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); + struct devlink_linecard *linecard; + + mutex_lock(&devlink->linecards_lock); + linecard = devlink_linecard_get_by_index(devlink, linecard_index); + if (linecard) + refcount_inc(&linecard->refcount); + mutex_unlock(&devlink->linecards_lock); + if (!linecard) + return ERR_PTR(-ENODEV); + return linecard; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_linecard_get_from_attrs(devlink, info->attrs); +} + +static void devlink_linecard_put(struct devlink_linecard *linecard) +{ + if 
(refcount_dec_and_test(&linecard->refcount)) + kfree(linecard); +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -617,16 +676,18 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_RATE BIT(2) #define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) +#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(4) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(5) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_linecard *linecard; struct devlink_port *devlink_port; struct devlink *devlink; int err; @@ -669,6 +730,13 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, goto unlock; } info->user_ptr[1] = rate_node; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) { + err = PTR_ERR(linecard); + goto unlock; + } + info->user_ptr[1] = linecard; } return 0; @@ -683,9 +751,14 @@ unlock: static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_linecard *linecard; struct devlink *devlink; devlink = info->user_ptr[0]; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = info->user_ptr[1]; + devlink_linecard_put(linecard); + } if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); devlink_put(devlink); @@ -1964,6 +2037,132 @@ static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, return err; } +static int devlink_nl_linecard_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_linecard *linecard, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_linecard_notify(struct devlink_linecard *linecard, + enum devlink_command cmd) +{ + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && + cmd != DEVLINK_CMD_LINECARD_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, + NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + info->snd_portid, 
info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_linecard *linecard; + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + mutex_lock(&devlink->linecards_lock); + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + if (err) { + mutex_unlock(&devlink->linecards_lock); + devlink_put(devlink); + goto out; + } + idx++; + } + mutex_unlock(&devlink->linecards_lock); +retry: + devlink_put(devlink); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -8589,6 +8788,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -8664,6 +8864,13 @@ static const struct genl_small_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_LINECARD_GET, + .doit = devlink_nl_cmd_linecard_get_doit, + .dumpit = devlink_nl_cmd_linecard_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, + /* can be retrieved by unprivileged users */ + }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -9043,6 +9250,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); + INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); @@ -9054,6 +9262,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); + mutex_init(&devlink->linecards_lock); refcount_set(&devlink->refcount, 1); init_completion(&devlink->comp); @@ -9080,10 +9289,14 @@ static void devlink_notify_register(struct devlink *devlink) struct devlink_param_item *param_item; struct devlink_trap_item *trap_item; struct devlink_port *devlink_port; + struct devlink_linecard *linecard; struct devlink_rate *rate_node; struct devlink_region *region; devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + list_for_each_entry(devlink_port, &devlink->port_list, list) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -9191,6 +9404,7 @@ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); + 
mutex_destroy(&devlink->linecards_lock); mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); WARN_ON(!list_empty(&devlink->trap_policer_list)); @@ -9203,6 +9417,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); @@ -9747,6 +9962,59 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, return 0; } +/** + * devlink_linecard_create - Create devlink linecard + * + * @devlink: devlink + * @linecard_index: driver-specific numerical identifier of the linecard + * + * Create devlink linecard instance with provided linecard index. + * Caller can use any indexing, even hw-related one. + */ +struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, + unsigned int linecard_index) +{ + struct devlink_linecard *linecard; + + mutex_lock(&devlink->linecards_lock); + if (devlink_linecard_index_exists(devlink, linecard_index)) { + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(-EEXIST); + } + + linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); + if (!linecard) { + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(-ENOMEM); + } + + linecard->devlink = devlink; + linecard->index = linecard_index; + list_add_tail(&linecard->list, &devlink->linecard_list); + refcount_set(&linecard->refcount, 1); + mutex_unlock(&devlink->linecards_lock); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + return linecard; +} +EXPORT_SYMBOL_GPL(devlink_linecard_create); + +/** + * devlink_linecard_destroy - Destroy devlink linecard + * + * @linecard: devlink linecard + */ +void devlink_linecard_destroy(struct devlink_linecard *linecard) +{ + struct devlink *devlink = linecard->devlink; + + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + mutex_lock(&devlink->linecards_lock); + list_del(&linecard->list); + mutex_unlock(&devlink->linecards_lock); + devlink_linecard_put(linecard); +} +EXPORT_SYMBOL_GPL(devlink_linecard_destroy); + int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From fcdc8ce23a309c26a67fc613a741d9b21a248311 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:26 +0300 Subject: devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. 
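A hedged sketch of the driver side under these assumptions (all my_* names are hypothetical; only the synchronous provision path is shown — a real driver may instead call the helpers later from its event handlers, given the async nature noted above):

  static int my_lc_provision(struct devlink_linecard *linecard, void *priv,
  			     const char *type, const void *type_priv,
  			     struct netlink_ext_ack *extack)
  {
  	struct my_lc *lc = priv;
  	int err;

  	/* Create the placeholder ports/netdevices for this type. */
  	err = my_lc_hw_provision(lc, type_priv);
  	if (err) {
  		devlink_linecard_provision_fail(linecard);
  		return err;
  	}
  	devlink_linecard_provision_set(linecard, type);
  	return 0;
  }

  static const struct devlink_linecard_ops my_lc_ops = {
  	.provision	= my_lc_provision,
  	.unprovision	= my_lc_unprovision,
  	.types_count	= my_lc_types_count,
  	.types_get	= my_lc_types_get,
  };

  /* At init time, one devlink line card object per physical slot: */
  lc->linecard = devlink_linecard_create(devlink, slot_index, &my_lc_ops, lc);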
Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../networking/devlink/devlink-linecard.rst | 121 ++++++++ Documentation/networking/devlink/index.rst | 1 + include/net/devlink.h | 43 ++- include/uapi/linux/devlink.h | 15 + net/core/devlink.c | 322 ++++++++++++++++++++- 5 files changed, 497 insertions(+), 5 deletions(-) create mode 100644 Documentation/networking/devlink/devlink-linecard.rst (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-linecard.rst b/Documentation/networking/devlink/devlink-linecard.rst new file mode 100644 index 000000000000..63ccd17f40ac --- /dev/null +++ b/Documentation/networking/devlink/devlink-linecard.rst @@ -0,0 +1,121 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Devlink Line card +================= + +Background +========== + +The ``devlink-linecard`` mechanism is targeted for manipulation of +line cards that serve as a detachable PHY modules for modular switch +system. Following operations are provided: + + * Get a list of supported line card types. 
+ * Provision of a slot with specific line card type. + * Get and monitor of line card state and its change. + +Line card according to the type may contain one or more gearboxes +to mux the lanes with certain speed to multiple ports with lanes +of different speed. Line card ensures N:M mapping between +the switch ASIC modules and physical front panel ports. + +Overview +======== + +Each line card devlink object is created by device driver, +according to the physical line card slots available on the device. + +Similar to splitter cable, where the device might have no way +of detection of the splitter cable geometry, the device +might not have a way to detect line card type. For that devices, +concept of provisioning is introduced. It allows the user to: + + * Provision a line card slot with certain line card type + + - Device driver would instruct the ASIC to prepare all + resources accordingly. The device driver would + create all instances, namely devlink port and netdevices + that reside on the line card, according to the line card type + * Manipulate of line card entities even without line card + being physically connected or powered-up + * Setup splitter cable on line card ports + + - As on the ordinary ports, user may provision a splitter + cable of a certain type, without the need to + be physically connected to the port + * Configure devlink ports and netdevices + +Netdevice carrier is decided as follows: + + * Line card is not inserted or powered-down + + - The carrier is always down + * Line card is inserted and powered up + + - The carrier is decided as for ordinary port netdevice + +Line card state +=============== + +The ``devlink-linecard`` mechanism supports the following line card states: + + * ``unprovisioned``: Line card is not provisioned on the slot. + * ``unprovisioning``: Line card slot is currently being unprovisioned. + * ``provisioning``: Line card slot is currently in a process of being provisioned + with a line card type. + * ``provisioning_failed``: Provisioning was not successful. + * ``provisioned``: Line card slot is provisioned with a type. + +The following diagram provides a general overview of ``devlink-linecard`` +state transitions:: + + +-------------------------+ + | | + +----------------------------------> unprovisioned | + | | | + | +--------|-------^--------+ + | | | + | | | + | +--------v-------|--------+ + | | | + | | provisioning | + | | | + | +------------|------------+ + | | + | +-----------------------------+ + | | | + | +------------v------------+ +------------v------------+ + | | | | | + +----- provisioning_failed | | provisioned | + | | | | | + | +------------^------------+ +------------|------------+ + | | | + | | | + | | +------------v------------+ + | | | | + | | | unprovisioning | + | | | | + | | +------------|------------+ + | | | + | +-----------------------------+ + | | + +-----------------------------------------------+ + + +Example usage +============= + +.. 
code:: shell + + $ devlink lc show [ DEV [ lc LC_INDEX ] ] + $ devlink lc set DEV lc LC_INDEX [ { type LC_TYPE | notype } ] + + # Show current line card configuration and status for all slots: + $ devlink lc + + # Set slot 8 to be provisioned with type "16x100G": + $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G + + # Set slot 8 to be unprovisioned: + $ devlink lc set pci/0000:01:00.0 lc 8 notype diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index c17cdb079611..850715512293 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -39,6 +39,7 @@ general. devlink-resource devlink-reload devlink-trap + devlink-linecard Driver-specific documentation ----------------------------- diff --git a/include/net/devlink.h b/include/net/devlink.h index 7aabdfb3bc6a..3e49d4ff498c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -149,6 +149,40 @@ struct devlink_port_new_attrs { sfnum_valid:1; }; +/** + * struct devlink_linecard_ops - Linecard operations + * @provision: callback to provision the linecard slot with certain + * type of linecard. As a result of this operation, + * driver is expected to eventually (could be after + * the function call returns) call one of: + * devlink_linecard_provision_set() + * devlink_linecard_provision_fail() + * @unprovision: callback to unprovision the linecard slot. As a result + * of this operation, driver is expected to eventually + * (could be after the function call returns) call + * devlink_linecard_provision_clear() + * devlink_linecard_provision_fail() + * @same_provision: callback to ask the driver if linecard is already + * provisioned in the same way user asks this linecard to be + * provisioned. 
+ * @types_count: callback to get number of supported types + * @types_get: callback to get next type in list + */ +struct devlink_linecard_ops { + int (*provision)(struct devlink_linecard *linecard, void *priv, + const char *type, const void *type_priv, + struct netlink_ext_ack *extack); + int (*unprovision)(struct devlink_linecard *linecard, void *priv, + struct netlink_ext_ack *extack); + bool (*same_provision)(struct devlink_linecard *linecard, void *priv, + const char *type, const void *type_priv); + unsigned int (*types_count)(struct devlink_linecard *linecard, + void *priv); + void (*types_get)(struct devlink_linecard *linecard, + void *priv, unsigned int index, const char **type, + const void **type_priv); +}; + struct devlink_sb_pool_info { enum devlink_sb_pool_type pool_type; u32 size; @@ -1537,9 +1571,14 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, int devlink_rate_leaf_create(struct devlink_port *port, void *priv); void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); void devlink_rate_nodes_destroy(struct devlink *devlink); -struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, - unsigned int linecard_index); +struct devlink_linecard * +devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type); +void devlink_linecard_provision_clear(struct devlink_linecard *linecard); +void devlink_linecard_provision_fail(struct devlink_linecard *linecard); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 59c33ed2d3e7..de91e4a0d476 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -343,6 +343,18 @@ enum devlink_reload_limit { #define DEVLINK_RELOAD_LIMITS_VALID_MASK (_BITUL(__DEVLINK_RELOAD_LIMIT_MAX) - 1) +enum devlink_linecard_state { + DEVLINK_LINECARD_STATE_UNSPEC, + DEVLINK_LINECARD_STATE_UNPROVISIONED, + DEVLINK_LINECARD_STATE_UNPROVISIONING, + DEVLINK_LINECARD_STATE_PROVISIONING, + DEVLINK_LINECARD_STATE_PROVISIONING_FAILED, + DEVLINK_LINECARD_STATE_PROVISIONED, + + __DEVLINK_LINECARD_STATE_MAX, + DEVLINK_LINECARD_STATE_MAX = __DEVLINK_LINECARD_STATE_MAX - 1 +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -559,6 +571,9 @@ enum devlink_attr { DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, /* u32 */ DEVLINK_ATTR_LINECARD_INDEX, /* u32 */ + DEVLINK_ATTR_LINECARD_STATE, /* u8 */ + DEVLINK_ATTR_LINECARD_TYPE, /* string */ + DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES, /* nested */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 4cdacd74b82a..b7c3a82fbd4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -72,11 +72,21 @@ struct devlink { char priv[] __aligned(NETDEV_ALIGN); }; +struct devlink_linecard_ops; +struct devlink_linecard_type; + struct devlink_linecard { struct list_head list; struct devlink *devlink; unsigned int index; refcount_t refcount; + const struct devlink_linecard_ops *ops; + void *priv; + enum devlink_linecard_state state; + struct mutex state_lock; /* Protects state */ + const char *type; + struct devlink_linecard_type *types; + unsigned int types_count; }; /** @@ -452,8 +462,10 @@ devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) static void devlink_linecard_put(struct devlink_linecard *linecard) { - if (refcount_dec_and_test(&linecard->refcount)) + if (refcount_dec_and_test(&linecard->refcount)) { + mutex_destroy(&linecard->state_lock); kfree(linecard); + } } struct devlink_sb { @@ -2037,6 +2049,11 @@ static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, return err; } +struct devlink_linecard_type { + const char *type; + const void *priv; +}; + static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, @@ -2044,7 +2061,10 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, u32 seq, int flags, struct netlink_ext_ack *extack) { + struct devlink_linecard_type *linecard_type; + struct nlattr *attr; void *hdr; + int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) @@ -2054,6 +2074,27 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) + goto nla_put_failure; + if (linecard->type && + nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) + goto nla_put_failure; + + if (linecard->types_count) { + attr = nla_nest_start(msg, + DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); + if (!attr) + goto nla_put_failure; + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, + linecard_type->type)) { + nla_nest_cancel(msg, attr); + goto nla_put_failure; + } + } + nla_nest_end(msg, attr); + } genlmsg_end(msg, hdr); return 0; @@ -2103,10 +2144,12 @@ static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; + mutex_lock(&linecard->state_lock); err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, info->snd_portid, info->snd_seq, 0, info->extack); + mutex_unlock(&linecard->state_lock); if (err) { nlmsg_free(msg); return err; @@ -2139,12 +2182,14 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, idx++; continue; } + mutex_lock(&linecard->state_lock); err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); + mutex_unlock(&linecard->state_lock); if (err) { mutex_unlock(&devlink->linecards_lock); devlink_put(devlink); @@ 
-2163,6 +2208,163 @@ out: return msg->len; } +static struct devlink_linecard_type * +devlink_linecard_type_lookup(struct devlink_linecard *linecard, + const char *type) +{ + struct devlink_linecard_type *linecard_type; + int i; + + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (!strcmp(type, linecard_type->type)) + return linecard_type; + } + return NULL; +} + +static int devlink_linecard_type_set(struct devlink_linecard *linecard, + const char *type, + struct netlink_ext_ack *extack) +{ + const struct devlink_linecard_ops *ops = linecard->ops; + struct devlink_linecard_type *linecard_type; + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + + linecard_type = devlink_linecard_type_lookup(linecard, type); + if (!linecard_type) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided"); + err = -EINVAL; + goto out; + } + + if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && + linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned"); + err = -EBUSY; + /* Check if the line card is provisioned in the same + * way the user asks. In case it is, make the operation + * to return success. + */ + if (ops->same_provision && + ops->same_provision(linecard, linecard->priv, + linecard_type->type, + linecard_type->priv)) + err = 0; + goto out; + } + + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; + linecard->type = linecard_type->type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = ops->provision(linecard, linecard->priv, linecard_type->type, + linecard_type->priv, extack); + if (err) { + /* Provisioning failed. Assume the linecard is unprovisioned + * for future operations. 
+ */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_linecard_type_unset(struct devlink_linecard *linecard, + struct netlink_ext_ack *extack) +{ + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + err = 0; + goto out; + } + + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { + NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned"); + err = 0; + goto out; + } + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = linecard->ops->unprovision(linecard, linecard->priv, + extack); + if (err) { + /* Unprovisioning failed. Assume the linecard is unprovisioned + * for future operations. + */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct netlink_ext_ack *extack = info->extack; + int err; + + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { + const char *type; + + type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); + if (strcmp(type, "")) { + err = devlink_linecard_type_set(linecard, type, extack); + if (err) + return err; + } else { + err = devlink_linecard_type_unset(linecard, extack); + if (err) + return err; + } + } + + return 0; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -8789,6 +8991,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -8871,6 +9074,12 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_LINECARD_SET, + .doit = devlink_nl_cmd_linecard_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, + }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -9962,19 +10171,56 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, return 0; } +static int 
devlink_linecard_types_init(struct devlink_linecard *linecard) +{ + struct devlink_linecard_type *linecard_type; + unsigned int count; + int i; + + count = linecard->ops->types_count(linecard, linecard->priv); + linecard->types = kmalloc_array(count, sizeof(*linecard_type), + GFP_KERNEL); + if (!linecard->types) + return -ENOMEM; + linecard->types_count = count; + + for (i = 0; i < count; i++) { + linecard_type = &linecard->types[i]; + linecard->ops->types_get(linecard, linecard->priv, i, + &linecard_type->type, + &linecard_type->priv); + } + return 0; +} + +static void devlink_linecard_types_fini(struct devlink_linecard *linecard) +{ + kfree(linecard->types); +} + /** * devlink_linecard_create - Create devlink linecard * * @devlink: devlink * @linecard_index: driver-specific numerical identifier of the linecard + * @ops: linecards ops + * @priv: user priv pointer * * Create devlink linecard instance with provided linecard index. * Caller can use any indexing, even hw-related one. + * + * Return: Line card structure or an ERR_PTR() encoded error code. */ -struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, - unsigned int linecard_index) +struct devlink_linecard * +devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) { struct devlink_linecard *linecard; + int err; + + if (WARN_ON(!ops || !ops->provision || !ops->unprovision || + !ops->types_count || !ops->types_get)) + return ERR_PTR(-EINVAL); mutex_lock(&devlink->linecards_lock); if (devlink_linecard_index_exists(devlink, linecard_index)) { @@ -9990,6 +10236,19 @@ struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, linecard->devlink = devlink; linecard->index = linecard_index; + linecard->ops = ops; + linecard->priv = priv; + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + mutex_init(&linecard->state_lock); + + err = devlink_linecard_types_init(linecard); + if (err) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(err); + } + list_add_tail(&linecard->list, &devlink->linecard_list); refcount_set(&linecard->refcount, 1); mutex_unlock(&devlink->linecards_lock); @@ -10010,11 +10269,68 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); + devlink_linecard_types_fini(linecard); mutex_unlock(&devlink->linecards_lock); devlink_linecard_put(linecard); } EXPORT_SYMBOL_GPL(devlink_linecard_destroy); +/** + * devlink_linecard_provision_set - Set provisioning on linecard + * + * @linecard: devlink linecard + * @type: linecard type + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. 
+ */ +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->type && strcmp(linecard->type, type)); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + linecard->type = type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); + +/** + * devlink_linecard_provision_clear - Clear provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the unprovision() op call or + * as a result of the unprovision() op call asynchronously. + */ +void devlink_linecard_provision_clear(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); + +/** + * devlink_linecard_provision_fail - Fail provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. + */ +void devlink_linecard_provision_fail(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); + int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From fc9f50d5b366cd9f35bdee22fe3f8d77833cb1d8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:27 +0300 Subject: devlink: implement line card active state Allow driver to mark a line card as active. Expose this state to the userspace over devlink netlink interface with proper notifications. 'active' state means that line card was plugged in after being provisioned. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../networking/devlink/devlink-linecard.rst | 11 +++--- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 41 ++++++++++++++++++++++ 4 files changed, 50 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-linecard.rst b/Documentation/networking/devlink/devlink-linecard.rst index 63ccd17f40ac..6c0b8928bc13 100644 --- a/Documentation/networking/devlink/devlink-linecard.rst +++ b/Documentation/networking/devlink/devlink-linecard.rst @@ -66,6 +66,7 @@ The ``devlink-linecard`` mechanism supports the following line card states: with a line card type. * ``provisioning_failed``: Provisioning was not successful. * ``provisioned``: Line card slot is provisioned with a type. + * ``active``: Line card is powered-up and active. 
The following diagram provides a general overview of ``devlink-linecard`` state transitions:: @@ -85,11 +86,11 @@ state transitions:: | | | +-----------------------------+ | | | - | +------------v------------+ +------------v------------+ - | | | | | - +----- provisioning_failed | | provisioned | - | | | | | - | +------------^------------+ +------------|------------+ + | +------------v------------+ +------------v------------+ +-------------------------+ + | | | | ----> | + +----- provisioning_failed | | provisioned | | active | + | | | | <---- | + | +------------^------------+ +------------|------------+ +-------------------------+ | | | | | | | | +------------v------------+ diff --git a/include/net/devlink.h b/include/net/devlink.h index 3e49d4ff498c..d8061a11fee6 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1579,6 +1579,8 @@ void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type); void devlink_linecard_provision_clear(struct devlink_linecard *linecard); void devlink_linecard_provision_fail(struct devlink_linecard *linecard); +void devlink_linecard_activate(struct devlink_linecard *linecard); +void devlink_linecard_deactivate(struct devlink_linecard *linecard); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index de91e4a0d476..b3d40a5d72ff 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -350,6 +350,7 @@ enum devlink_linecard_state { DEVLINK_LINECARD_STATE_PROVISIONING, DEVLINK_LINECARD_STATE_PROVISIONING_FAILED, DEVLINK_LINECARD_STATE_PROVISIONED, + DEVLINK_LINECARD_STATE_ACTIVE, __DEVLINK_LINECARD_STATE_MAX, DEVLINK_LINECARD_STATE_MAX = __DEVLINK_LINECARD_STATE_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index b7c3a82fbd4b..aec0a517282c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10331,6 +10331,47 @@ void devlink_linecard_provision_fail(struct devlink_linecard *linecard) } EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); +/** + * devlink_linecard_activate - Set linecard active + * + * @linecard: devlink linecard + */ +void devlink_linecard_activate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); + linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_activate); + +/** + * devlink_linecard_deactivate - Set linecard inactive + * + * @linecard: devlink linecard + */ +void devlink_linecard_deactivate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + switch (linecard->state) { + case DEVLINK_LINECARD_STATE_ACTIVE: + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + break; + case DEVLINK_LINECARD_STATE_UNPROVISIONING: + /* Line card is being deactivated as part + * of unprovisioning flow. 
+ */ + break; + default: + WARN_ON(1); + break; + } + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); + int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From 38a6f0865796e26fc38fff4858f681d9ae76fa0f Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:46 +0800 Subject: net: sched: support hash selecting tx queue This patch allows users to pick queue_mapping, range from A to B. Then we can load balance packets from A to B tx queue. The range is an unsigned 16bit value in decimal format. $ tc filter ... action skbedit queue_mapping skbhash A B "skbedit queue_mapping QUEUE_MAPPING" (from "man 8 tc-skbedit") is enhanced with flags: SKBEDIT_F_TXQ_SKBHASH +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | qn | qm v v v HTB/FQ FIFO ... FIFO For example: If P1 sends out packets to different Pods on other host, and we want distribute flows from qn - qm. Then we can use skb->hash as hash. setup commands: $ NETDEV=eth0 $ ip netns add n1 $ ip link add ipv1 link $NETDEV type ipvlan mode l2 $ ip link set ipv1 netns n1 $ ip netns exec n1 ifconfig ipv1 2.2.2.100/24 up $ tc qdisc add dev $NETDEV clsact $ tc filter add dev $NETDEV egress protocol ip prio 1 \ flower skip_hw src_ip 2.2.2.100 action skbedit queue_mapping skbhash 2 6 $ tc qdisc add dev $NETDEV handle 1: root mq $ tc qdisc add dev $NETDEV parent 1:1 handle 2: htb $ tc class add dev $NETDEV parent 2: classid 2:1 htb rate 100kbit $ tc class add dev $NETDEV parent 2: classid 2:2 htb rate 200kbit $ tc qdisc add dev $NETDEV parent 1:2 tbf rate 100mbit burst 100mb latency 1 $ tc qdisc add dev $NETDEV parent 1:3 pfifo $ tc qdisc add dev $NETDEV parent 1:4 pfifo $ tc qdisc add dev $NETDEV parent 1:5 pfifo $ tc qdisc add dev $NETDEV parent 1:6 pfifo $ tc qdisc add dev $NETDEV parent 1:7 pfifo $ ip netns exec n1 iperf3 -c 2.2.2.1 -i 1 -t 10 -P 10 pick txqueue from 2 - 6: $ ethtool -S $NETDEV | grep -i tx_queue_[0-9]_bytes tx_queue_0_bytes: 42 tx_queue_1_bytes: 0 tx_queue_2_bytes: 11442586444 tx_queue_3_bytes: 7383615334 tx_queue_4_bytes: 3981365579 tx_queue_5_bytes: 3983235051 tx_queue_6_bytes: 6706236461 tx_queue_7_bytes: 42 tx_queue_8_bytes: 0 tx_queue_9_bytes: 0 txqueues 2 - 6 are mapped to classid 1:3 - 1:7 $ tc -s class show dev $NETDEV ... class mq 1:3 root leaf 8002: Sent 11949133672 bytes 7929798 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:4 root leaf 8003: Sent 7710449050 bytes 5117279 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:5 root leaf 8004: Sent 4157648675 bytes 2758990 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:6 root leaf 8005: Sent 4159632195 bytes 2759990 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:7 root leaf 8006: Sent 7003169603 bytes 4646912 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 ... Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. 
Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Signed-off-by: Tonghao Zhang Reviewed-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_skbedit.h | 1 + include/uapi/linux/tc_act/tc_skbedit.h | 2 ++ net/sched/act_skbedit.c | 49 ++++++++++++++++++++++++++++++++-- 3 files changed, 50 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index cab8229b9bed..dc1079f28e13 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -17,6 +17,7 @@ struct tcf_skbedit_params { u32 mark; u32 mask; u16 queue_mapping; + u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index 800e93377218..6cb6101208d0 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -29,6 +29,7 @@ #define SKBEDIT_F_PTYPE 0x8 #define SKBEDIT_F_MASK 0x10 #define SKBEDIT_F_INHERITDSFIELD 0x20 +#define SKBEDIT_F_TXQ_SKBHASH 0x40 struct tc_skbedit { tc_gen; @@ -45,6 +46,7 @@ enum { TCA_SKBEDIT_PTYPE, TCA_SKBEDIT_MASK, TCA_SKBEDIT_FLAGS, + TCA_SKBEDIT_QUEUE_MAPPING_MAX, __TCA_SKBEDIT_MAX }; #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1c5fdb6e7c2f..e3bd11dfe1ca 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -23,6 +23,20 @@ static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; +static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, + struct sk_buff *skb) +{ + u16 queue_mapping = params->queue_mapping; + + if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { + u32 hash = skb_get_hash(skb); + + queue_mapping += hash % params->mapping_mod; + } + + return netdev_cap_txqueue(skb->dev, queue_mapping); +} + static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -62,7 +76,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, #ifdef CONFIG_NET_EGRESS netdev_xmit_skip_txqueue(true); #endif - skb_set_queue_mapping(skb, params->queue_mapping); + skb_set_queue_mapping(skb, tcf_skbedit_hash(params, skb)); } if (params->flags & SKBEDIT_F_MARK) { skb->mark &= ~params->mask; @@ -96,6 +110,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, + [TCA_SKBEDIT_QUEUE_MAPPING_MAX] = { .len = sizeof(u16) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -112,6 +127,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; + u16 mapping_mod = 1; bool exists = false; int ret = 0, err; u32 index; @@ -157,6 +173,25 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (tb[TCA_SKBEDIT_FLAGS] != NULL) { u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); + if (*pure_flags & SKBEDIT_F_TXQ_SKBHASH) { + u16 *queue_mapping_max; + + if (!tb[TCA_SKBEDIT_QUEUE_MAPPING] || + !tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]) { + NL_SET_ERR_MSG_MOD(extack, "Missing required range of queue_mapping."); 
+ return -EINVAL; + } + + queue_mapping_max = + nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]); + if (*queue_mapping_max < *queue_mapping) { + NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid, max < min."); + return -EINVAL; + } + + mapping_mod = *queue_mapping_max - *queue_mapping + 1; + flags |= SKBEDIT_F_TXQ_SKBHASH; + } if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) flags |= SKBEDIT_F_INHERITDSFIELD; } @@ -208,8 +243,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) params_new->priority = *priority; - if (flags & SKBEDIT_F_QUEUE_MAPPING) + if (flags & SKBEDIT_F_QUEUE_MAPPING) { params_new->queue_mapping = *queue_mapping; + params_new->mapping_mod = mapping_mod; + } if (flags & SKBEDIT_F_MARK) params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) @@ -276,6 +313,13 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; + if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { + if (nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING_MAX, + params->queue_mapping + params->mapping_mod - 1)) + goto nla_put_failure; + + pure_flags |= SKBEDIT_F_TXQ_SKBHASH; + } if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) goto nla_put_failure; @@ -325,6 +369,7 @@ static size_t tcf_skbedit_get_fill_size(const struct tc_action *act) return nla_total_size(sizeof(struct tc_skbedit)) + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */ + + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING_MAX */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */ -- cgit v1.2.3 From b617be33502d2bfefffef71924c7a7ba50264ff6 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 11 Apr 2022 21:45:27 +0300 Subject: spi: add SPI_RX_CPHA_FLIP mode bit Some SPI devices latch MOSI bits on one clock phase, but produce valid MISO bits on the other phase. Add SPI_RX_CPHA_FLIP mode to instruct the controller driver to flip CPHA for Rx (MISO) only transfers. Signed-off-by: Baruch Siach Link: https://lore.kernel.org/r/a715ca92713ca02071f33dcca9960a66a03c949a.1649702729.git.baruch@tkos.co.il Signed-off-by: Mark Brown --- include/uapi/linux/spi/spi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/spi/spi.h b/include/uapi/linux/spi/spi.h index 236a85f08ded..9d5f58059703 100644 --- a/include/uapi/linux/spi/spi.h +++ b/include/uapi/linux/spi/spi.h @@ -27,6 +27,7 @@ #define SPI_TX_OCTAL _BITUL(13) /* transmit with 8 wires */ #define SPI_RX_OCTAL _BITUL(14) /* receive with 8 wires */ #define SPI_3WIRE_HIZ _BITUL(15) /* high impedance turnaround */ +#define SPI_RX_CPHA_FLIP _BITUL(16) /* flip CPHA on Rx only xfer */ /* * All the bits defined above should be covered by SPI_MODE_USER_MASK. @@ -36,6 +37,6 @@ * These bits must not overlap. A static assert check should make sure of that. * If adding extra bits, make sure to increase the bit index below as well. 
*/ -#define SPI_MODE_USER_MASK (_BITUL(16) - 1) +#define SPI_MODE_USER_MASK (_BITUL(17) - 1) #endif /* _UAPI_SPI_H */ -- cgit v1.2.3 From b4000312822615ba2222e368188029e9b725dbf4 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Tue, 19 Apr 2022 11:14:33 +0300 Subject: net/sched: flower: Add number of vlan tags filter These are bookkeeping parts of the new num_of_vlans filter. Defines, dump, load and set are being done here. Signed-off-by: Boris Sukholitko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 404f97fb239c..9a2ee1e39fad 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -587,6 +587,8 @@ enum { TCA_FLOWER_KEY_HASH, /* u32 */ TCA_FLOWER_KEY_HASH_MASK, /* u32 */ + TCA_FLOWER_KEY_NUM_OF_VLANS, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 86fd0420ac4f..4ec4d742e82f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -72,6 +72,7 @@ struct fl_flow_key { } tp_range; struct flow_dissector_key_ct ct; struct flow_dissector_key_hash hash; + struct flow_dissector_key_num_of_vlans num_of_vlans; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -712,6 +713,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_NUM_OF_VLANS] = { .type = NLA_U8 }, }; @@ -1615,6 +1617,11 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)); + fl_set_key_val(tb, &key->num_of_vlans, + TCA_FLOWER_KEY_NUM_OF_VLANS, + &mask->num_of_vlans, + TCA_FLOWER_UNSPEC, + sizeof(key->num_of_vlans)); if (is_vlan_key(tb[TCA_FLOWER_KEY_ETH_TYPE], ðertype, key, mask)) { fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID, @@ -1906,6 +1913,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_CT, ct); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_HASH, hash); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, num_of_vlans); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2994,6 +3003,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->basic.n_proto))) goto nla_put_failure; + if (mask->num_of_vlans.num_of_vlans) { + if (nla_put_u8(skb, TCA_FLOWER_KEY_NUM_OF_VLANS, key->num_of_vlans.num_of_vlans)) + goto nla_put_failure; + } + if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) goto nla_put_failure; -- cgit v1.2.3 From 9e4ab6c89109472082616f8d2f6ada7deaffe161 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 19 Apr 2022 12:22:19 +0100 Subject: arm64/sme: Implement vector length configuration prctl()s As for SVE provide a prctl() interface which allows processes to configure their SME vector length. 
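A minimal userspace sketch of the new interface (hypothetical example, not part of this patch; the fallback defines mirror the values the patch adds to prctl.h, and the 32-byte vector length is an arbitrary choice):

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_SME_SET_VL
  #define PR_SME_SET_VL      63
  #define PR_SME_GET_VL      64
  #define PR_SME_VL_LEN_MASK 0xffff
  #endif

  int main(void)
  {
      /* Ask for a 32 byte (256 bit) streaming vector length. */
      if (prctl(PR_SME_SET_VL, 32) < 0) {
          perror("PR_SME_SET_VL"); /* fails with EINVAL without SME */
          return 1;
      }

      /* The kernel may grant a different supported VL; read it back. */
      printf("SME VL: %d bytes\n",
             prctl(PR_SME_GET_VL) & PR_SME_VL_LEN_MASK);
      return 0;
  }

As with the SVE prctl()s, a successful call returns a non-negative value packing the vector length in the low 16 bits alongside the flag bits, so the result must be masked with PR_SME_VL_LEN_MASK before use.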
Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20220419112247.711548-12-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fpsimd.h | 4 ++++ arch/arm64/include/asm/processor.h | 4 +++- arch/arm64/include/asm/thread_info.h | 1 + arch/arm64/kernel/fpsimd.c | 32 ++++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 9 +++++++++ kernel/sys.c | 12 ++++++++++++ 6 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 32cd682258d9..38fd6aab7feb 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -288,6 +288,8 @@ static inline int sme_max_virtualisable_vl(void) } extern unsigned int sme_get_vl(void); +extern int sme_set_current_vl(unsigned long arg); +extern int sme_get_current_vl(void); #else @@ -299,6 +301,8 @@ static inline void sme_setup(void) { } static inline unsigned int sme_get_vl(void) { return 0; } static inline int sme_max_vl(void) { return 0; } static inline int sme_max_virtualisable_vl(void) { return 0; } +static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } +static inline int sme_get_current_vl(void) { return -EINVAL; } #endif /* ! CONFIG_ARM64_SME */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index abf34a9c2eab..7a57cbff8a03 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -355,9 +355,11 @@ extern void __init minsigstksz_setup(void); */ #include -/* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */ +/* Userspace interface for PR_S[MV]E_{SET,GET}_VL prctl()s: */ #define SVE_SET_VL(arg) sve_set_current_vl(arg) #define SVE_GET_VL() sve_get_current_vl() +#define SME_SET_VL(arg) sme_set_current_vl(arg) +#define SME_GET_VL() sme_get_current_vl() /* PR_PAC_RESET_KEYS prctl */ #define PAC_RESET_KEYS(tsk, arg) ptrauth_prctl_reset_keys(tsk, arg) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e1317b7c4525..4e6b58dcd6f9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -82,6 +82,7 @@ int arch_dup_task_struct(struct task_struct *dst, #define TIF_SVE_VL_INHERIT 24 /* Inherit SVE vl_onexec across exec */ #define TIF_SSBD 25 /* Wants SSB mitigation */ #define TIF_TAGGED_ADDR 26 /* Allow tagged user addresses */ +#define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 754a96563f6f..39f44fcb9b99 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -149,6 +149,8 @@ static unsigned int vec_vl_inherit_flag(enum vec_type type) switch (type) { case ARM64_VEC_SVE: return TIF_SVE_VL_INHERIT; + case ARM64_VEC_SME: + return TIF_SME_VL_INHERIT; default: WARN_ON_ONCE(1); return 0; @@ -807,6 +809,36 @@ int sve_get_current_vl(void) return vec_prctl_status(ARM64_VEC_SVE, 0); } +#ifdef CONFIG_ARM64_SME +/* PR_SME_SET_VL */ +int sme_set_current_vl(unsigned long arg) +{ + unsigned long vl, flags; + int ret; + + vl = arg & PR_SME_VL_LEN_MASK; + flags = arg & ~vl; + + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + ret = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags); + if (ret) + return ret; + + return vec_prctl_status(ARM64_VEC_SME, flags); +} + +/* PR_SME_GET_VL 
*/ +int sme_get_current_vl(void) +{ + if (!system_supports_sme() || is_compat_task()) + return -EINVAL; + + return vec_prctl_status(ARM64_VEC_SME, 0); +} +#endif /* CONFIG_ARM64_SME */ + static void vec_probe_vqs(struct vl_info *info, DECLARE_BITMAP(map, SVE_VQ_MAX)) { diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index e998764f0262..a5e06dcbba13 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -272,6 +272,15 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +/* arm64 Scalable Matrix Extension controls */ +/* Flag values must be in sync with SVE versions */ +#define PR_SME_SET_VL 63 /* set task vector length */ +# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */ +#define PR_SME_GET_VL 64 /* get task vector length */ +/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */ +# define PR_SME_VL_LEN_MASK 0xffff +# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/kernel/sys.c b/kernel/sys.c index 374f83e95239..b911fa6d81ab 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,6 +117,12 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef SME_SET_VL +# define SME_SET_VL(a) (-EINVAL) +#endif +#ifndef SME_GET_VL +# define SME_GET_VL() (-EINVAL) +#endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif @@ -2541,6 +2547,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_SME_SET_VL: + error = SME_SET_VL(arg2); + break; + case PR_SME_GET_VL: + error = SME_GET_VL(); + break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; -- cgit v1.2.3 From e12310a0d30f260b26297bc8d7c95769489af038 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 19 Apr 2022 12:22:28 +0100 Subject: arm64/sme: Implement ptrace support for streaming mode SVE registers The streaming mode SVE registers are represented using the same data structures as for SVE but since the vector lengths supported and in use may not be the same as SVE we represent them with a new type NT_ARM_SSVE. Unfortunately we only have a single 16 bit reserved field available in the header so there is no space to fit the current and maximum vector length for both standard and streaming SVE mode without redefining the structure in a way that creates a complicated and fragile ABI. Since FFR is not present in streaming mode it is read and written as zero. Setting NT_ARM_SSVE registers will put the task into streaming mode, similarly setting NT_ARM_SVE registers will exit it. Reads that do not correspond to the current mode of the task will return the header with no register data. For compatibility reasons, on write, setting no flag for the register type will be interpreted as setting SVE registers, though users can provide no register data as an alternative mechanism for doing so.
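A hedged debugger-side sketch (hypothetical example, not part of this patch; assumes a stopped tracee pid, struct user_sve_header from asm/ptrace.h, and the NT_ARM_SSVE value added to elf.h below):

  #include <stdio.h>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>
  #include <asm/ptrace.h>

  #ifndef NT_ARM_SSVE
  #define NT_ARM_SSVE 0x40b
  #endif

  static void show_ssve(pid_t pid)
  {
      struct user_sve_header hdr;
      struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };

      /* Read just the header to learn the streaming vector length. */
      if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_ARM_SSVE, &iov)) {
          perror("NT_ARM_SSVE"); /* no SME, or a pre-SME kernel */
          return;
      }

      printf("streaming VL %u (max %u)\n", hdr.vl, hdr.max_vl);

      if ((hdr.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE)
          printf("tracee is in streaming mode, register data follows\n");
      else
          printf("tracee is not in streaming mode, header only\n");
  }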
Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20220419112247.711548-21-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fpsimd.h | 1 + arch/arm64/include/uapi/asm/ptrace.h | 13 ++- arch/arm64/kernel/fpsimd.c | 31 +++-- arch/arm64/kernel/ptrace.c | 214 +++++++++++++++++++++++++++-------- include/uapi/linux/elf.h | 1 + 5 files changed, 201 insertions(+), 59 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 6c33bc832ed4..5afcd0709aae 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -144,6 +144,7 @@ struct vl_info { extern void sve_alloc(struct task_struct *task); extern void fpsimd_release_task(struct task_struct *task); extern void fpsimd_sync_to_sve(struct task_struct *task); +extern void fpsimd_force_sync_to_sve(struct task_struct *task); extern void sve_sync_to_fpsimd(struct task_struct *task); extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task); diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 758ae984ff97..522b925a78c1 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -109,7 +109,7 @@ struct user_hwdebug_state { } dbg_regs[16]; }; -/* SVE/FP/SIMD state (NT_ARM_SVE) */ +/* SVE/FP/SIMD state (NT_ARM_SVE & NT_ARM_SSVE) */ struct user_sve_header { __u32 size; /* total meaningful regset content in bytes */ @@ -220,6 +220,7 @@ struct user_sve_header { (SVE_PT_SVE_PREG_OFFSET(vq, __SVE_NUM_PREGS) - \ SVE_PT_SVE_PREGS_OFFSET(vq)) +/* For streaming mode SVE (SSVE) FFR must be read and written as zero */ #define SVE_PT_SVE_FFR_OFFSET(vq) \ (SVE_PT_REGS_OFFSET + __SVE_FFR_OFFSET(vq)) @@ -240,10 +241,12 @@ struct user_sve_header { - SVE_PT_SVE_OFFSET + (__SVE_VQ_BYTES - 1)) \ / __SVE_VQ_BYTES * __SVE_VQ_BYTES) -#define SVE_PT_SIZE(vq, flags) \ - (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \ - SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags) \ - : SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags)) +#define SVE_PT_SIZE(vq, flags) \ + (((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \ + SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, flags) \ + : ((((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD ? 
\ + SVE_PT_FPSIMD_OFFSET + SVE_PT_FPSIMD_SIZE(vq, flags) \ + : SVE_PT_REGS_OFFSET))) /* pointer authentication masks (NT_ARM_PAC_MASK) */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 80f7ca12f855..94f06e9d37cf 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -643,7 +643,7 @@ static void fpsimd_to_sve(struct task_struct *task) if (!system_supports_sve()) return; - vq = sve_vq_from_vl(task_get_sve_vl(task)); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); __fpsimd_to_sve(sst, fst, vq); } @@ -660,7 +660,7 @@ static void fpsimd_to_sve(struct task_struct *task) */ static void sve_to_fpsimd(struct task_struct *task) { - unsigned int vq; + unsigned int vq, vl; void const *sst = task->thread.sve_state; struct user_fpsimd_state *fst = &task->thread.uw.fpsimd_state; unsigned int i; @@ -669,7 +669,8 @@ static void sve_to_fpsimd(struct task_struct *task) if (!system_supports_sve()) return; - vq = sve_vq_from_vl(task_get_sve_vl(task)); + vl = thread_get_cur_vl(&task->thread); + vq = sve_vq_from_vl(vl); for (i = 0; i < SVE_NUM_ZREGS; ++i) { p = (__uint128_t const *)ZREG(sst, vq, i); fst->vregs[i] = arm64_le128_to_cpu(*p); @@ -717,6 +718,19 @@ void sve_alloc(struct task_struct *task) } +/* + * Force the FPSIMD state shared with SVE to be updated in the SVE state + * even if the SVE state is the current active state. + * + * This should only be called by ptrace. task must be non-runnable. + * task->thread.sve_state must point to at least sve_state_size(task) + * bytes of allocated kernel memory. + */ +void fpsimd_force_sync_to_sve(struct task_struct *task) +{ + fpsimd_to_sve(task); +} + /* * Ensure that task->thread.sve_state is up to date with respect to * the user task, irrespective of when SVE is in use or not. @@ -727,7 +741,8 @@ void sve_alloc(struct task_struct *task) */ void fpsimd_sync_to_sve(struct task_struct *task) { - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) fpsimd_to_sve(task); } @@ -741,7 +756,8 @@ void fpsimd_sync_to_sve(struct task_struct *task) */ void sve_sync_to_fpsimd(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_SVE)) + if (test_tsk_thread_flag(task, TIF_SVE) || + thread_sm_enabled(&task->thread)) sve_to_fpsimd(task); } @@ -766,7 +782,7 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) if (!test_tsk_thread_flag(task, TIF_SVE)) return; - vq = sve_vq_from_vl(task_get_sve_vl(task)); + vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); memset(sst, 0, SVE_SIG_REGS_SIZE(vq)); __fpsimd_to_sve(sst, fst, vq); @@ -810,8 +826,7 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, /* * To ensure the FPSIMD bits of the SVE vector registers are preserved, * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. Since the vector length can only be changed - * with a syscall we can't be in streaming mode while reconfiguring. + * regular FPSIMD thread. 
*/ if (task == current) { get_cpu_fpsimd_context(); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 230a47b9189e..60185c27b394 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -713,21 +713,51 @@ static int system_call_set(struct task_struct *target, #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, - struct task_struct *target) + struct task_struct *target, + enum vec_type type) { unsigned int vq; + bool active; + bool fpsimd_only; + enum vec_type task_type; memset(header, 0, sizeof(*header)); - header->flags = test_tsk_thread_flag(target, TIF_SVE) ? - SVE_PT_REGS_SVE : SVE_PT_REGS_FPSIMD; - if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) - header->flags |= SVE_PT_VL_INHERIT; + /* Check if the requested registers are active for the task */ + if (thread_sm_enabled(&target->thread)) + task_type = ARM64_VEC_SME; + else + task_type = ARM64_VEC_SVE; + active = (task_type == type); + + switch (type) { + case ARM64_VEC_SVE: + if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); + break; + case ARM64_VEC_SME: + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header->flags |= SVE_PT_VL_INHERIT; + fpsimd_only = false; + break; + default: + WARN_ON_ONCE(1); + return; + } - header->vl = task_get_sve_vl(target); + if (active) { + if (fpsimd_only) { + header->flags |= SVE_PT_REGS_FPSIMD; + } else { + header->flags |= SVE_PT_REGS_SVE; + } + } + + header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); - header->max_vl = sve_max_vl(); + header->max_vl = vec_max_vl(type); header->size = SVE_PT_SIZE(vq, header->flags); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); @@ -738,19 +768,17 @@ static unsigned int sve_size_from_header(struct user_sve_header const *header) return ALIGN(header->size, SVE_VQ_BYTES); } -static int sve_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) +static int sve_get_common(struct task_struct *target, + const struct user_regset *regset, + struct membuf to, + enum vec_type type) { struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ - sve_init_header_from_task(&header, target); + sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); @@ -758,49 +786,61 @@ static int sve_get(struct task_struct *target, if (target == current) fpsimd_preserve_current_state(); - /* Registers: FPSIMD-only case */ - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) + BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + + switch ((header.flags & SVE_PT_REGS_MASK)) { + case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); - /* Otherwise: full SVE case */ + case SVE_PT_REGS_SVE: + start = SVE_PT_SVE_OFFSET; + end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); + membuf_write(&to, target->thread.sve_state, end - start); - BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); - start = SVE_PT_SVE_OFFSET; - end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); - membuf_write(&to, target->thread.sve_state, end - start); + start = end; + end = SVE_PT_SVE_FPSR_OFFSET(vq); + membuf_zero(&to, end - start); - start = end; - end = SVE_PT_SVE_FPSR_OFFSET(vq); - membuf_zero(&to, end - start); + 
/* + * Copy fpsr, and fpcr which must follow contiguously in + * struct fpsimd_state: + */ + start = end; + end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; + membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, + end - start); - /* - * Copy fpsr, and fpcr which must follow contiguously in - * struct fpsimd_state: - */ - start = end; - end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; - membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, end - start); + start = end; + end = sve_size_from_header(&header); + return membuf_zero(&to, end - start); - start = end; - end = sve_size_from_header(&header); - return membuf_zero(&to, end - start); + default: + return 0; + } } -static int sve_set(struct task_struct *target, +static int sve_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SVE); +} + +static int sve_set_common(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf, + enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; - if (!system_supports_sve()) - return -EINVAL; - /* Header */ if (count < sizeof(header)) return -EINVAL; @@ -813,13 +853,37 @@ static int sve_set(struct task_struct *target, * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by * vec_set_vector_length(), which will also validate them for us: */ - ret = vec_set_vector_length(target, ARM64_VEC_SVE, header.vl, + ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) goto out; /* Actual VL set may be less than the user asked for: */ - vq = sve_vq_from_vl(task_get_sve_vl(target)); + vq = sve_vq_from_vl(task_get_vl(target, type)); + + /* Enter/exit streaming mode */ + if (system_supports_sme()) { + u64 old_svcr = target->thread.svcr; + + switch (type) { + case ARM64_VEC_SVE: + target->thread.svcr &= ~SYS_SVCR_EL0_SM_MASK; + break; + case ARM64_VEC_SME: + target->thread.svcr |= SYS_SVCR_EL0_SM_MASK; + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* + * If we switched then invalidate any existing SVE + * state and ensure there's storage. + */ + if (target->thread.svcr != old_svcr) + sve_alloc(target); + } /* Registers: FPSIMD-only case */ @@ -828,10 +892,15 @@ static int sve_set(struct task_struct *target, ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); + if (type == ARM64_VEC_SME) + fpsimd_force_sync_to_sve(target); goto out; } - /* Otherwise: full SVE case */ + /* + * Otherwise: no registers or full SVE case. For backwards + * compatibility reasons we treat empty flags as SVE registers. + */ /* * If setting a different VL from the requested VL and there is @@ -852,8 +921,9 @@ static int sve_set(struct task_struct *target, /* * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing registers - * unmodified. + * FPSIMD regs, so that a short copyin leaves trailing + * registers unmodified. Always enable SVE even if going into + * streaming mode. 
*/ fpsimd_sync_to_sve(target); set_tsk_thread_flag(target, TIF_SVE); @@ -889,8 +959,46 @@ out: return ret; } +static int sve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sve()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SVE); +} + #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_SME + +static int ssve_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_get_common(target, regset, to, ARM64_VEC_SME); +} + +static int ssve_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + if (!system_supports_sme()) + return -EINVAL; + + return sve_set_common(target, regset, pos, count, kbuf, ubuf, + ARM64_VEC_SME); +} + +#endif /* CONFIG_ARM64_SME */ + #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, @@ -1108,6 +1216,9 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif +#ifdef CONFIG_ARM64_SVE + REGSET_SSVE, +#endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, REGSET_PAC_ENABLED_KEYS, @@ -1188,6 +1299,17 @@ static const struct user_regset aarch64_regsets[] = { .set = sve_set, }, #endif +#ifdef CONFIG_ARM64_SME + [REGSET_SSVE] = { /* Streaming mode SVE */ + .core_note_type = NT_ARM_SSVE, + .n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE), + SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = ssve_get, + .set = ssve_set, + }, +#endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { .core_note_type = NT_ARM_PAC_MASK, diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 787c657bfae8..a8dc688e1826 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -431,6 +431,7 @@ typedef struct elf64_shdr { #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ #define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ -- cgit v1.2.3 From 776b4a1cf36411e96972455ca72906b722b80ea1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 19 Apr 2022 12:22:29 +0100 Subject: arm64/sme: Add ptrace support for ZA The ZA array can be read and written with the NT_ARM_ZA. Similarly to our interface for the SVE vector registers the regset consists of a header with information on the current vector length followed by an optional register data payload, represented as for signals as a series of horizontal vectors from 0 to VL/8 in the endianness independent format used for vectors. On get if ZA is enabled then register data will be provided, otherwise it will be omitted. On set if register data is provided then ZA is enabled and initialized using the provided data, otherwise it is disabled. 
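A short sketch of the resulting debugger view (hypothetical example, not part of this patch; assumes a stopped tracee pid and the NT_ARM_ZA value added to elf.h below). The size field in the header is what distinguishes the enabled and disabled cases:

  #include <stdio.h>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>
  #include <asm/ptrace.h>

  #ifndef NT_ARM_ZA
  #define NT_ARM_ZA 0x40c
  #endif

  static void show_za(pid_t pid)
  {
      struct user_za_header hdr;
      struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };

      if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_ARM_ZA, &iov)) {
          perror("NT_ARM_ZA");
          return;
      }

      /* A size beyond the bare header means ZA is enabled. */
      if (hdr.size > ZA_PT_ZA_OFFSET)
          printf("ZA enabled, SVL %u, %u bytes of array data\n",
                 hdr.vl, hdr.size - (unsigned int)ZA_PT_ZA_OFFSET);
      else
          printf("ZA disabled, SVL %u, header only\n", hdr.vl);
  }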
Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20220419112247.711548-22-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/ptrace.h | 56 ++++++++++++++ arch/arm64/kernel/ptrace.c | 144 +++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 3 files changed, 201 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 522b925a78c1..7fa2f7036aa7 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -268,6 +268,62 @@ struct user_pac_generic_keys { __uint128_t apgakey; }; +/* ZA state (NT_ARM_ZA) */ + +struct user_za_header { + __u32 size; /* total meaningful regset content in bytes */ + __u32 max_size; /* maximum possible size for this thread */ + __u16 vl; /* current vector length */ + __u16 max_vl; /* maximum possible vector length */ + __u16 flags; + __u16 __reserved; +}; + +/* + * Common ZA_PT_* flags: + * These must be kept in sync with prctl interface in <linux/prctl.h> + */ +#define ZA_PT_VL_INHERIT ((1 << 17) /* PR_SME_VL_INHERIT */ >> 16) +#define ZA_PT_VL_ONEXEC ((1 << 18) /* PR_SME_SET_VL_ONEXEC */ >> 16) + + +/* + * The remainder of the ZA state follows struct user_za_header. The + * total size of the ZA state (including header) depends on the + * metadata in the header: ZA_PT_SIZE(vq) gives the total size + * of the state in bytes, including the header. + * + * Refer to <asm/sigcontext.h> for details of how to pass the correct + * "vq" argument to these macros. + */ + +/* Offset from the start of struct user_za_header to the register data */ +#define ZA_PT_ZA_OFFSET \ + ((sizeof(struct user_za_header) + (__SVE_VQ_BYTES - 1)) \ + / __SVE_VQ_BYTES * __SVE_VQ_BYTES) + +/* + * The payload starts at offset ZA_PT_ZA_OFFSET, and is of size + * ZA_PT_ZA_SIZE(vq). + * + * The ZA array is stored as a sequence of horizontal vectors ZAV of SVL/8 + * bytes each, starting from vector 0. + * + * Additional data might be appended in the future. + * + * The ZA matrix is represented in memory in an endianness-invariant layout + * which differs from the layout used for the FPSIMD V-registers on big-endian + * systems: see sigcontext.h for more explanation.
+ */ + +#define ZA_PT_ZAV_OFFSET(vq, n) \ + (ZA_PT_ZA_OFFSET + ((vq * __SVE_VQ_BYTES) * n)) + +#define ZA_PT_ZA_SIZE(vq) ((vq * __SVE_VQ_BYTES) * (vq * __SVE_VQ_BYTES)) + +#define ZA_PT_SIZE(vq) \ + (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 60185c27b394..47d8a7472171 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -997,6 +997,141 @@ static int ssve_set(struct task_struct *target, ARM64_VEC_SME); } +static int za_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + memset(&header, 0, sizeof(header)); + + if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) + header.flags |= ZA_PT_VL_INHERIT; + + header.vl = task_get_sme_vl(target); + vq = sve_vq_from_vl(header.vl); + header.max_vl = sme_max_vl(); + header.max_size = ZA_PT_SIZE(vq); + + /* If ZA is not active there is only the header */ + if (thread_za_enabled(&target->thread)) + header.size = ZA_PT_SIZE(vq); + else + header.size = ZA_PT_ZA_OFFSET; + + membuf_write(&to, &header, sizeof(header)); + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + end = ZA_PT_ZA_OFFSET; + + if (target == current) + fpsimd_preserve_current_state(); + + /* Any register data to include? */ + if (thread_za_enabled(&target->thread)) { + start = end; + end = ZA_PT_SIZE(vq); + membuf_write(&to, target->thread.za_state, end - start); + } + + /* Zero any trailing padding */ + start = end; + end = ALIGN(header.size, SVE_VQ_BYTES); + return membuf_zero(&to, end - start); +} + +static int za_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct user_za_header header; + unsigned int vq; + unsigned long start, end; + + if (!system_supports_sme()) + return -EINVAL; + + /* Header */ + if (count < sizeof(header)) + return -EINVAL; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, + 0, sizeof(header)); + if (ret) + goto out; + + /* + * All current ZA_PT_* flags are consumed by + * vec_set_vector_length(), which will also validate them for + * us: + */ + ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, + ((unsigned long)header.flags) << 16); + if (ret) + goto out; + + /* Actual VL set may be less than the user asked for: */ + vq = sve_vq_from_vl(task_get_sme_vl(target)); + + /* Ensure there is some SVE storage for streaming mode */ + if (!target->thread.sve_state) { + sve_alloc(target); + if (!target->thread.sve_state) { + clear_thread_flag(TIF_SME); + ret = -ENOMEM; + goto out; + } + } + + /* Allocate/reinit ZA storage */ + sme_alloc(target); + if (!target->thread.za_state) { + ret = -ENOMEM; + clear_tsk_thread_flag(target, TIF_SME); + goto out; + } + + /* If there is no data then disable ZA */ + if (!count) { + target->thread.svcr &= ~SYS_SVCR_EL0_ZA_MASK; + goto out; + } + + /* + * If setting a different VL from the requested VL and there is + * register data, the data layout will be wrong: don't even + * try to set the registers in this case. 
+ */ + if (vq != sve_vq_from_vl(header.vl)) { + ret = -EIO; + goto out; + } + + BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); + start = ZA_PT_ZA_OFFSET; + end = ZA_PT_SIZE(vq); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.za_state, + start, end); + if (ret) + goto out; + + /* Mark ZA as active and let userspace use it */ + set_tsk_thread_flag(target, TIF_SME); + target->thread.svcr |= SYS_SVCR_EL0_ZA_MASK; + +out: + fpsimd_flush_task_state(target); + return ret; +} + #endif /* CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_PTR_AUTH @@ -1218,6 +1353,7 @@ enum aarch64_regset { #endif #ifdef CONFIG_ARM64_SVE REGSET_SSVE, + REGSET_ZA, #endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, @@ -1309,6 +1445,14 @@ static const struct user_regset aarch64_regsets[] = { .regset_get = ssve_get, .set = ssve_set, }, + [REGSET_ZA] = { /* SME ZA */ + .core_note_type = NT_ARM_ZA, + .n = DIV_ROUND_UP(ZA_PT_ZA_SIZE(SVE_VQ_MAX), SVE_VQ_BYTES), + .size = SVE_VQ_BYTES, + .align = SVE_VQ_BYTES, + .regset_get = za_get, + .set = za_set, + }, #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index a8dc688e1826..97808f958903 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -432,6 +432,7 @@ typedef struct elf64_shdr { #define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ #define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ -- cgit v1.2.3 From 26f89535a5bb17915a2e1062c3999a2ee797c7b0 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Wed, 13 Apr 2022 22:12:46 -0700 Subject: cxl/mbox: Use type __u32 for mailbox payload sizes Payload sizes for mailbox commands are expected to be positive values coming from userspace. The documentation correctly describes these as always unsigned values. The mailbox and send structures that support the mailbox commands however, use __s32 types for the payloads. Replace __s32 with __u32 in the mailbox and send command structures and update usages. Kernel users of the interface already block all negative values and there is no known ability for userspace to have grown a dependency on submitting negative values to the kernel. The known user of the IOCTL, the CXL command line interface (cxl-cli) already enforces positive size values. A Smatch warning of a signedness uncovered this issue. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Alison Schofield Link: https://lore.kernel.org/r/20220414051246.1244575-1-alison.schofield@intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 28 +++++++++++++++------------- include/uapi/linux/cxl_mem.h | 14 +++++++------- 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 8a8388599a85..d54a6d175fff 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -35,6 +35,7 @@ static bool cxl_raw_allow_all; .flags = _flags, \ } +#define CXL_VARIABLE_PAYLOAD ~0U /* * This table defines the supported mailbox commands for the driver. This table * is made up of a UAPI structure. 
Non-negative values as parameters in the @@ -44,26 +45,26 @@ static bool cxl_raw_allow_all; static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = { CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE), #ifdef CONFIG_CXL_MEM_RAW_COMMANDS - CXL_CMD(RAW, ~0, ~0, 0), + CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0), #endif - CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE), + CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE), CXL_CMD(GET_FW_INFO, 0, 0x50, 0), CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0), - CXL_CMD(GET_LSA, 0x8, ~0, 0), + CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0), CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0), - CXL_CMD(GET_LOG, 0x18, ~0, CXL_CMD_FLAG_FORCE_ENABLE), + CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE), CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0), - CXL_CMD(SET_LSA, ~0, 0, 0), + CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0), CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0), CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0), CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0), CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0), - CXL_CMD(GET_POISON, 0x10, ~0, 0), + CXL_CMD(GET_POISON, 0x10, CXL_VARIABLE_PAYLOAD, 0), CXL_CMD(INJECT_POISON, 0x8, 0, 0), CXL_CMD(CLEAR_POISON, 0x48, 0, 0), CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0), CXL_CMD(SCAN_MEDIA, 0x11, 0, 0), - CXL_CMD(GET_SCAN_MEDIA, 0, ~0, 0), + CXL_CMD(GET_SCAN_MEDIA, 0, CXL_VARIABLE_PAYLOAD, 0), }; /* @@ -187,9 +188,10 @@ int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in, * Variable sized commands can't be validated and so it's up to the * caller to do that if they wish. */ - if (cmd->info.size_out >= 0 && mbox_cmd.size_out != out_size) - return -EIO; - + if (cmd->info.size_out != CXL_VARIABLE_PAYLOAD) { + if (mbox_cmd.size_out != out_size) + return -EIO; + } return 0; } EXPORT_SYMBOL_NS_GPL(cxl_mbox_send_cmd, CXL); @@ -275,7 +277,7 @@ static int cxl_mbox_cmd_ctor(struct cxl_mbox_cmd *mbox, } /* Prepare to handle a full payload for variable sized output */ - if (out_size < 0) + if (out_size == CXL_VARIABLE_PAYLOAD) mbox->size_out = cxlds->payload_size; else mbox->size_out = out_size; @@ -353,11 +355,11 @@ static int cxl_to_mem_cmd(struct cxl_mem_command *mem_cmd, return -EBUSY; /* Check the input buffer is the expected size */ - if (info->size_in >= 0 && info->size_in != send_cmd->in.size) + if (info->size_in != send_cmd->in.size) return -ENOMEM; /* Check the output buffer is at least large enough */ - if (info->size_out >= 0 && send_cmd->out.size < info->size_out) + if (send_cmd->out.size < info->size_out) return -ENOMEM; *mem_cmd = (struct cxl_mem_command) { diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index 8d206f27bb6d..c71021a2a9ed 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -68,8 +68,8 @@ static const struct { * struct cxl_command_info - Command information returned from a query. * @id: ID number for the command. * @flags: Flags that specify command behavior. - * @size_in: Expected input size, or -1 if variable length. - * @size_out: Expected output size, or -1 if variable length. + * @size_in: Expected input size, or ~0 if variable length. + * @size_out: Expected output size, or ~0 if variable length. * * Represents a single command that is supported by both the driver and the * hardware. This is returned as part of an array from the query ioctl. 
The @@ -78,7 +78,7 @@ static const struct { * * - @id = 10 * - @flags = 0 - * - @size_in = -1 + * - @size_in = ~0 * - @size_out = 0 * * See struct cxl_mem_query_commands. @@ -89,8 +89,8 @@ struct cxl_command_info { __u32 flags; #define CXL_MEM_COMMAND_FLAG_MASK GENMASK(0, 0) - __s32 size_in; - __s32 size_out; + __u32 size_in; + __u32 size_out; }; /** @@ -169,13 +169,13 @@ struct cxl_send_command { __u32 retval; struct { - __s32 size; + __u32 size; __u32 rsvd; __u64 payload; } in; struct { - __s32 size; + __u32 size; __u32 rsvd; __u64 payload; } out; -- cgit v1.2.3 From 567f882a401346779d05a90beb8f21865ebdd398 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 9 Mar 2022 10:55:43 +0000 Subject: media: cec.h: add cec_msg_recv_is_rx/tx_result helpers These two helper functions return true if the received message contains the result of a previous non-blocking transmit. Either the tx_status result (cec_msg_recv_is_tx_result) of the transmit, or the rx_status result (cec_msg_recv_is_rx_result) of the reply to the original transmit. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/cec.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index de936f5e446d..1d48da926216 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -142,6 +142,26 @@ static inline void cec_msg_set_reply_to(struct cec_msg *msg, msg->reply = msg->timeout = 0; } +/** + * cec_msg_recv_is_tx_result - return true if this message contains the + * result of an earlier non-blocking transmit + * @msg: the message structure from CEC_RECEIVE + */ +static inline int cec_msg_recv_is_tx_result(const struct cec_msg *msg) +{ + return msg->sequence && msg->tx_status && !msg->rx_status; +} + +/** + * cec_msg_recv_is_rx_result - return true if this message contains the + * reply of an earlier non-blocking transmit + * @msg: the message structure from CEC_RECEIVE + */ +static inline int cec_msg_recv_is_rx_result(const struct cec_msg *msg) +{ + return msg->sequence && !msg->tx_status && msg->rx_status; +} + /* cec_msg flags field */ #define CEC_MSG_FL_REPLY_TO_FOLLOWERS (1 << 0) #define CEC_MSG_FL_RAW (1 << 1) -- cgit v1.2.3 From 4e4dab4bb6029dbee63f12a249ddc44b0124ea63 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Wed, 2 Mar 2022 22:03:01 +0000 Subject: media: media.h: Add new media link type To describe in the kernel the connection between devices and their supporting peripherals (for example, a camera sensor and the vcm driving the focusing lens for it), add a new type of media link to introduce the concept of these ancillary links. Add some elements to the uAPI documentation to explain the new link type, their purpose and some aspects of their current implementation. 
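Userspace walking the graph can recognise the new link type with a simple mask test; a small sketch (hypothetical helper, not part of this patch), usable with the flags of a link returned by MEDIA_IOC_G_TOPOLOGY or MEDIA_IOC_ENUM_LINKS:

  #include <linux/media.h>

  /*
   * True if the link marks two entities as one logical unit,
   * e.g. a camera sensor and the lens controller serving it.
   */
  static inline int media_link_is_ancillary(__u32 flags)
  {
      return (flags & MEDIA_LNK_FL_LINK_TYPE) ==
             MEDIA_LNK_FL_ANCILLARY_LINK;
  }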
Reviewed-by: Laurent Pinchart Signed-off-by: Daniel Scally Reviewed-by: Jean-Michel Hautbois Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- .../media/mediactl/media-controller-model.rst | 6 ++++++ .../userspace-api/media/mediactl/media-types.rst | 17 ++++++++++++----- include/uapi/linux/media.h | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/mediactl/media-controller-model.rst b/Documentation/userspace-api/media/mediactl/media-controller-model.rst index 222cb99debb5..78bfdfb2a322 100644 --- a/Documentation/userspace-api/media/mediactl/media-controller-model.rst +++ b/Documentation/userspace-api/media/mediactl/media-controller-model.rst @@ -33,3 +33,9 @@ are: - An **interface link** is a point-to-point bidirectional control connection between a Linux Kernel interface and an entity. + +- An **ancillary link** is a point-to-point connection denoting that two + entities form a single logical unit. For example this could represent the + fact that a particular camera sensor and lens controller form a single + physical module, meaning this lens controller drives the lens for this + camera sensor. \ No newline at end of file diff --git a/Documentation/userspace-api/media/mediactl/media-types.rst b/Documentation/userspace-api/media/mediactl/media-types.rst index 0a26397bd01d..0ffeece1e0c8 100644 --- a/Documentation/userspace-api/media/mediactl/media-types.rst +++ b/Documentation/userspace-api/media/mediactl/media-types.rst @@ -412,14 +412,21 @@ must be set for every pad. is set by drivers and is read-only for applications. * - ``MEDIA_LNK_FL_LINK_TYPE`` - - This is a bitmask that defines the type of the link. Currently, - two types of links are supported: + - This is a bitmask that defines the type of the link. The following + link types are currently supported: .. _MEDIA-LNK-FL-DATA-LINK: - ``MEDIA_LNK_FL_DATA_LINK`` if the link is between two pads + ``MEDIA_LNK_FL_DATA_LINK`` for links that represent a data connection + between two pads. .. _MEDIA-LNK-FL-INTERFACE-LINK: - ``MEDIA_LNK_FL_INTERFACE_LINK`` if the link is between an - interface and an entity + ``MEDIA_LNK_FL_INTERFACE_LINK`` for links that associate an entity to its + interface. + + .. _MEDIA-LNK-FL-ANCILLARY-LINK: + + ``MEDIA_LNK_FL_ANCILLARY_LINK`` for links that represent a physical + relationship between two entities. The link may or may not be + immutable, so applications must not assume either case. diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 200fa8462b90..afbae7213d35 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -226,6 +226,7 @@ struct media_pad_desc { #define MEDIA_LNK_FL_LINK_TYPE (0xf << 28) # define MEDIA_LNK_FL_DATA_LINK (0 << 28) # define MEDIA_LNK_FL_INTERFACE_LINK (1 << 28) +# define MEDIA_LNK_FL_ANCILLARY_LINK (2 << 28) struct media_link_desc { struct media_pad_desc source; -- cgit v1.2.3 From 3d22dd432889f2f538b53f36f9f6bcd54825fc22 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 28 Mar 2022 17:01:53 +0100 Subject: media: media.h: remove unneeded inclusion Commit b3b7a9f138b7 ("[media] media-device: Use u64 ints for pointers") added this #include , presumably in order to use uintptr_t. Now that it is gone, we can compile this for userspace without . 
Signed-off-by: Masahiro Yamada Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index afbae7213d35..3ddadaea849f 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -20,9 +20,6 @@ #ifndef __LINUX_MEDIA_H #define __LINUX_MEDIA_H -#ifndef __KERNEL__ -#include -#endif #include #include -- cgit v1.2.3 From 2308d5aff8d083a44aa02197d2f5687b73d98f82 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Thu, 3 Mar 2022 15:06:31 +0000 Subject: media: v4l: Add Qualcomm custom compressed pixel formats Add custom Qualcomm raw compressed pixel formats. They are used in Qualcomm SoCs to optimize the interconnect bandwidth. Signed-off-by: Stanimir Varbanov Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/pixfmt-reserved.rst | 19 +++++++++++++++++++ drivers/media/v4l2-core/v4l2-ioctl.c | 2 ++ include/uapi/linux/videodev2.h | 2 ++ 3 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst b/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst index cabfa34b7db5..0ff68cd8cf62 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst @@ -239,6 +239,25 @@ please make a proposal on the linux-media mailing list. It remains an opaque intermediate format and the MDP hardware must be used to convert ``V4L2_PIX_FMT_MT21C`` to ``V4L2_PIX_FMT_NV12M``, ``V4L2_PIX_FMT_YUV420M`` or ``V4L2_PIX_FMT_YVU420``. + * .. _V4L2-PIX-FMT-QC08C: + + - ``V4L2_PIX_FMT_QC08C`` + - 'QC08C' + - Compressed Macro-tile 8-Bit YUV420 format used by Qualcomm platforms. + It is an opaque intermediate format. The used compression is lossless + and it is used by various multimedia hardware blocks like GPU, display + controllers, ISP and video accelerators. + It contains four planes for progressive video and eight planes for + interlaced video. + * .. _V4L2-PIX-FMT-QC10C: + + - ``V4L2_PIX_FMT_QC10C`` + - 'QC10C' + - Compressed Macro-tile 10-Bit YUV420 format used by Qualcomm platforms. + It is an opaque intermediate format. The used compression is lossless + and it is used by various multimedia hardware blocks like GPU, display + controllers, ISP and video accelerators. + It contains four planes for progressive video. .. 
raw:: latex \normalsize diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index db5947fbd9a9..e2636539c9db 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1445,6 +1445,8 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_SE401: descr = "GSPCA SE401"; break; case V4L2_PIX_FMT_S5C_UYVY_JPG: descr = "S5C73MX interleaved UYVY/JPEG"; break; case V4L2_PIX_FMT_MT21C: descr = "Mediatek Compressed Format"; break; + case V4L2_PIX_FMT_QC08C: descr = "QCOM Compressed 8-bit Format"; break; + case V4L2_PIX_FMT_QC10C: descr = "QCOM Compressed 10-bit Format"; break; default: if (fmt->description[0]) return; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 3768a0a80830..6d465dc443b7 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -746,6 +746,8 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_INZI v4l2_fourcc('I', 'N', 'Z', 'I') /* Intel Planar Greyscale 10-bit and Depth 16-bit */ #define V4L2_PIX_FMT_CNF4 v4l2_fourcc('C', 'N', 'F', '4') /* Intel 4-bit packed depth confidence information */ #define V4L2_PIX_FMT_HI240 v4l2_fourcc('H', 'I', '2', '4') /* BTTV 8-bit dithered RGB */ +#define V4L2_PIX_FMT_QC08C v4l2_fourcc('Q', '0', '8', 'C') /* Qualcomm 8-bit compressed */ +#define V4L2_PIX_FMT_QC10C v4l2_fourcc('Q', '1', '0', 'C') /* Qualcomm 10-bit compressed */ /* 10bit raw bayer packed, 32 bytes for every 25 pixels, last LSB 6 bits unused */ #define V4L2_PIX_FMT_IPU3_SBGGR10 v4l2_fourcc('i', 'p', '3', 'b') /* IPU3 packed 10-bit BGGR bayer */ -- cgit v1.2.3 From fcbc4acf8b8dff5fc420a14026bd4ab1798cf465 Mon Sep 17 00:00:00 2001 From: Dikshita Agarwal Date: Tue, 19 Apr 2022 06:06:42 +0100 Subject: media: v4l2-ctrls: Add intra-refresh type control Add a control to set intra-refresh type. Signed-off-by: Dikshita Agarwal Reviewed-by: Nicolas Dufresne Acked-by: Hans Verkuil Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/ext-ctrls-codec.rst | 22 ++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls-defs.c | 9 +++++++++ include/uapi/linux/v4l2-controls.h | 5 +++++ 3 files changed, 36 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst index 4cd7c541fc30..c24977fa7329 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst @@ -1180,6 +1180,28 @@ enum v4l2_mpeg_video_h264_entropy_mode - is set to non zero value. Applicable to H264, H263 and MPEG4 encoder. +``V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE (enum)`` + +enum v4l2_mpeg_video_intra_refresh_period_type - + Sets the type of intra refresh. The period to refresh + the whole frame is specified by V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD. + Note that if this control is not present, then it is undefined what + refresh type is used and it is up to the driver to decide. + Applicable to H264 and HEVC encoders. Possible values are: + +.. tabularcolumns:: |p{9.6cm}|p{7.9cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + + * - ``V4L2_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE_RANDOM`` + - The whole frame is completely refreshed randomly + after the specified period. + * - ``V4L2_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE_CYCLIC`` + - The whole frame MBs are completely refreshed in cyclic order + after the specified period. 
+ ``V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD (integer)`` Intra macroblock refresh period. This sets the period to refresh the whole frame. In other words, this defines the number of frames diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index 54ca4e6b820b..16f42d2fd359 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -572,6 +572,11 @@ const char * const *v4l2_ctrl_get_menu(u32 id) "VBV/CPB Limit", NULL, }; + static const char * const intra_refresh_period_type[] = { + "Random", + "Cyclic", + NULL, + }; switch (id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: @@ -705,6 +710,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return hevc_start_code; case V4L2_CID_CAMERA_ORIENTATION: return camera_orientation; + case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: + return intra_refresh_period_type; default: return NULL; } @@ -834,6 +841,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: return "Decoder Slice Interface"; case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: return "MPEG4 Loop Filter Enable"; case V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB: return "Number of Intra Refresh MBs"; + case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: return "Intra Refresh Period Type"; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: return "Intra Refresh Period"; case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE: return "Frame Level Rate Control Enable"; case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: return "H264 MB Level Rate Control"; @@ -1360,6 +1368,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_STATELESS_H264_DECODE_MODE: case V4L2_CID_STATELESS_H264_START_CODE: case V4L2_CID_CAMERA_ORIENTATION: + case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: *type = V4L2_CTRL_TYPE_MENU; break; case V4L2_CID_LINK_FREQ: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index bb40129446d4..dfff69ed88f7 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -449,6 +449,11 @@ enum v4l2_mpeg_video_multi_slice_mode { #define V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (V4L2_CID_CODEC_BASE+234) #define V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR (V4L2_CID_CODEC_BASE+235) #define V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD (V4L2_CID_CODEC_BASE+236) +#define V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE (V4L2_CID_CODEC_BASE+237) +enum v4l2_mpeg_video_intra_refresh_period_type { + V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE_RANDOM = 0, + V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE_CYCLIC = 1, +}; /* CIDs for the MPEG-2 Part 2 (H.262) codec */ #define V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL (V4L2_CID_CODEC_BASE+270) -- cgit v1.2.3 From 8e29da69feade64ec7fe9e1a2824b967c5183a21 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:44:00 -0600 Subject: io_uring: add support for IORING_ASYNC_CANCEL_ALL The current cancelation will look up and cancel the first request it finds based on the key passed in. Add a flag that allows cancelation of any request that matches the key. It completes with the number of requests found and canceled, or res < 0 if an error occurred.
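A hedged liburing sketch (hypothetical example, not part of this patch; assumes an initialized ring whose pending requests were submitted with their user_data set to key, and no other completions in flight). The sqe fields match what io_async_cancel_prep() reads below:

  #include <errno.h>
  #include <liburing.h>
  #include <string.h>

  /* Cancel every pending request whose user_data matches "key". */
  static int cancel_all_by_key(struct io_uring *ring, __u64 key)
  {
      struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
      struct io_uring_cqe *cqe;
      int ret;

      if (!sqe)
          return -EBUSY;
      memset(sqe, 0, sizeof(*sqe));
      sqe->opcode = IORING_OP_ASYNC_CANCEL;
      sqe->addr = key;                        /* the cancelation key */
      sqe->cancel_flags = IORING_ASYNC_CANCEL_ALL;

      ret = io_uring_submit(ring);
      if (ret < 0)
          return ret;
      ret = io_uring_wait_cqe(ring, &cqe);
      if (ret < 0)
          return ret;

      /* res >= 0 is the number of requests found and canceled. */
      ret = cqe->res;
      io_uring_cqe_seen(ring, cqe);
      return ret;
  }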
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-4-axboe@kernel.dk --- fs/io-wq.h | 1 + fs/io_uring.c | 91 +++++++++++++++++++++++++++++++++---------- include/uapi/linux/io_uring.h | 7 ++++ 3 files changed, 78 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..ba6eee76d028 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int cancel_seq; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index b72323c3fe9b..b43cdf1a4555 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -403,6 +403,7 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; int rsrc_cached_refs; + atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; @@ -585,6 +586,7 @@ struct io_sync { struct io_cancel { struct file *file; u64 addr; + u32 flags; }; struct io_timeout { @@ -991,6 +993,8 @@ struct io_defer_entry { struct io_cancel_data { struct io_ring_ctx *ctx; u64 data; + u32 flags; + int seq; }; struct io_op_def { @@ -1726,6 +1730,7 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.list.next = NULL; req->work.flags = 0; + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -6159,6 +6164,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, int v; INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; @@ -6316,6 +6322,11 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } return req; } return NULL; @@ -6501,9 +6512,15 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = cd->data == req->cqe.user_data; - if (found) - break; + if (cd->data != req->cqe.user_data) + continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + found = true; + break; } if (!found) return ERR_PTR(-ENOENT); @@ -6777,7 +6794,16 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->cqe.user_data == cd->data; + if (req->ctx != cd->ctx) + return false; + if (req->cqe.user_data != cd->data) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; } static int io_async_cancel_one(struct io_uring_task *tctx, @@ -6789,7 +6815,8 @@ static int io_async_cancel_one(struct io_uring_task *tctx, if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, false); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, + cd->flags & IORING_ASYNC_CANCEL_ALL); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6837,27 +6864,33 @@ static int io_async_cancel_prep(struct 
io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || - sqe->splice_fd_in) + if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); + req->cancel.flags = READ_ONCE(sqe->cancel_flags); + if (req->cancel.flags & ~IORING_ASYNC_CANCEL_ALL) + return -EINVAL; + return 0; } -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - struct io_cancel_data cd = { - .ctx = ctx, - .data = req->cancel.addr, - }; + bool cancel_all = cd->flags & IORING_ASYNC_CANCEL_ALL; + struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; - int ret; + int ret, nr = 0; - ret = io_try_cancel(req, &cd); - if (ret != -ENOENT) - goto done; + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!cancel_all) + return ret; + nr++; + } while (1); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, issue_flags); @@ -6865,12 +6898,28 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - ret = io_async_cancel_one(tctx, &cd); - if (ret != -ENOENT) - break; + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!cancel_all) + break; + nr++; + } } io_ring_submit_unlock(ctx, issue_flags); -done: + return cancel_all ? nr : ret; +} + +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = req->cancel.addr, + .flags = req->cancel.flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + ret = __io_async_cancel(&cd, req, issue_flags); if (ret < 0) req_set_fail(req); io_req_complete_post(req, ret, 0); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1845cf7c80ba..476e58a2837f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -187,6 +187,13 @@ enum { #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +/* + * ASYNC_CANCEL flags. + * + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + */ +#define IORING_ASYNC_CANCEL_ALL (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 4bf94615b8886305199ed5755cb72fea88258d15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:44:01 -0600 Subject: io_uring: allow IORING_OP_ASYNC_CANCEL with 'fd' key Currently sqe->addr must contain the user_data of the request being canceled. Introduce the IORING_ASYNC_CANCEL_FD flag, which tells the kernel that we're keying off the file fd instead for cancelation. This allows canceling any request that a) uses a file, and b) was assigned the file based on the value being passed in. 
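For illustration (not part of the patch), keying the raw-SQE cancel sketch shown earlier off a file descriptor instead of user_data only changes which field carries the key; the 'fd' value here is an assumption:

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->fd = fd;                               /* key off this descriptor */
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD; /* optionally OR in
	                                             * IORING_ASYNC_CANCEL_ALL to
	                                             * cancel every request on 'fd' */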
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-5-axboe@kernel.dk --- fs/io_uring.c | 67 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/io_uring.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index b43cdf1a4555..cf0d5437b77d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -587,6 +587,7 @@ struct io_cancel { struct file *file; u64 addr; u32 flags; + s32 fd; }; struct io_timeout { @@ -992,7 +993,10 @@ struct io_defer_entry { struct io_cancel_data { struct io_ring_ctx *ctx; - u64 data; + union { + u64 data; + struct file *file; + }; u32 flags; int seq; }; @@ -6332,6 +6336,29 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, return NULL; } +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + static bool io_poll_disarm(struct io_kiocb *req) __must_hold(&ctx->completion_lock) { @@ -6345,8 +6372,12 @@ static bool io_poll_disarm(struct io_kiocb *req) static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, false, cd); + struct io_kiocb *req; + if (cd->flags & IORING_ASYNC_CANCEL_FD) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); if (!req) return -ENOENT; io_poll_cancel_req(req); @@ -6796,8 +6827,13 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) if (req->ctx != cd->ctx) return false; - if (req->cqe.user_data != cd->data) - return false; + if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } if (cd->flags & IORING_ASYNC_CANCEL_ALL) { if (cd->seq == req->work.cancel_seq) return false; @@ -6851,7 +6887,8 @@ static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; - ret = io_timeout_cancel(ctx, cd); + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); out: spin_unlock(&ctx->completion_lock); return ret; @@ -6862,15 +6899,17 @@ static int io_async_cancel_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~IORING_ASYNC_CANCEL_ALL) + if (req->cancel.flags & ~(IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_FD)) return -EINVAL; + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) + req->cancel.fd = READ_ONCE(sqe->fd); return 0; } @@ -6919,7 +6958,21 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) }; int ret; + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->cancel.fd, + 
issue_flags); + else + req->file = io_file_get_normal(req, req->cancel.fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + ret = __io_async_cancel(&cd, req, issue_flags); +done: if (ret < 0) req_set_fail(req); io_req_complete_post(req, ret, 0); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 476e58a2837f..cc7fe82a1798 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -191,8 +191,11 @@ enum { * ASYNC_CANCEL flags. * * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the + * request 'user_data' */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) +#define IORING_ASYNC_CANCEL_FD (1U << 1) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.2.3 From 970f256edb8c1259c8ed48d52b38215135396126 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2022 10:44:02 -0600 Subject: io_uring: add support for IORING_ASYNC_CANCEL_ANY Rather than match on a specific key, be it user_data or file, allow canceling any request that we can look up. Works like IORING_ASYNC_CANCEL_ALL in that it cancels multiple requests, but it doesn't key off user_data or the file. Can't be set with IORING_ASYNC_CANCEL_FD, as that's a key selector. Only one may be used at a time. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20220418164402.75259-6-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 39 +++++++++++++++++++++++++-------------- include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index cf0d5437b77d..03134ec070a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6348,7 +6348,8 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry(req, list, hash_node) { - if (req->file != cd->file) + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) continue; if (cd->seq == req->work.cancel_seq) continue; @@ -6374,7 +6375,7 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - if (cd->flags & IORING_ASYNC_CANCEL_FD) + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); @@ -6543,9 +6544,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - if (cd->data != req->cqe.user_data) + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != req->cqe.user_data) continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { if (cd->seq == req->work.cancel_seq) continue; req->work.cancel_seq = cd->seq; @@ -6827,14 +6829,16 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) if (req->ctx != cd->ctx) return false; - if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; } else { if (req->cqe.user_data != cd->data) return false; } - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { if (cd->seq == req->work.cancel_seq) return false; req->work.cancel_seq = cd->seq; @@ -6847,12 +6851,13 @@ static int io_async_cancel_one(struct
io_uring_task *tctx, { enum io_wq_cancel cancel_ret; int ret = 0; + bool all; if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, - cd->flags & IORING_ASYNC_CANCEL_ALL); + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6894,6 +6899,9 @@ out: return ret; } +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6906,10 +6914,13 @@ static int io_async_cancel_prep(struct io_kiocb *req, req->cancel.addr = READ_ONCE(sqe->addr); req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~(IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_FD)) + if (req->cancel.flags & ~CANCEL_FLAGS) return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; req->cancel.fd = READ_ONCE(sqe->fd); + } return 0; } @@ -6917,7 +6928,7 @@ static int io_async_cancel_prep(struct io_kiocb *req, static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, unsigned int issue_flags) { - bool cancel_all = cd->flags & IORING_ASYNC_CANCEL_ALL; + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; int ret, nr = 0; @@ -6926,7 +6937,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, ret = io_try_cancel(req, cd); if (ret == -ENOENT) break; - if (!cancel_all) + if (!all) return ret; nr++; } while (1); @@ -6939,13 +6950,13 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, ret = io_async_cancel_one(tctx, cd); if (ret != -ENOENT) { - if (!cancel_all) + if (!all) break; nr++; } } io_ring_submit_unlock(ctx, issue_flags); - return cancel_all ? nr : ret; + return all ? nr : ret; } static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cc7fe82a1798..980d82eb196e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -193,9 +193,11 @@ enum { * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the * request 'user_data' + * IORING_ASYNC_CANCEL_ANY Match any request */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) +#define IORING_ASYNC_CANCEL_ANY (1U << 2) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.2.3 From e9621e2bec80fe63f677a759066a5089b292f43a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Wed, 23 Mar 2022 08:44:19 -0700 Subject: io_uring: add fsetxattr and setxattr support This adds support to io_uring for the fsetxattr and setxattr API. 
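For illustration (not part of the patch), the SQE field mapping that __io_setxattr_prep()/io_setxattr_prep() in the patch below read back out; everything except the example name, value and path is taken from that prep code:

	/* Hypothetical raw-SQE setup for IORING_OP_SETXATTR */
	const char *name = "user.comment", *value = "hello", *path = "/tmp/f";

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SETXATTR;
	sqe->addr  = (__u64)(unsigned long)name;   /* attribute name */
	sqe->addr2 = (__u64)(unsigned long)value;  /* attribute value */
	sqe->len   = strlen(value);                /* value size */
	sqe->addr3 = (__u64)(unsigned long)path;   /* target path */
	sqe->xattr_flags = 0;                      /* XATTR_CREATE / XATTR_REPLACE */

IORING_OP_FSETXATTR takes the same name/value/size triple but targets the already-opened file in sqe->fd rather than a path in sqe->addr3.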
Signed-off-by: Stefan Roesch Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20220323154420.3301504-4-shr@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 165 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 6 +- 2 files changed, 170 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index e57d47a23682..bb812d007d30 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -80,6 +80,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -779,6 +780,12 @@ struct io_async_rw { struct wait_page_queue wpq; }; +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -943,6 +950,7 @@ struct io_kiocb { struct io_symlink symlink; struct io_hardlink hardlink; struct io_msg msg; + struct io_xattr xattr; }; u8 opcode; @@ -1211,6 +1219,10 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MSG_RING] = { .needs_file = 1, }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_SETXATTR] = {}, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -4190,6 +4202,144 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static inline void __io_xattr_finish(struct io_kiocb *req) +{ + struct io_xattr *ix = &req->xattr; + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + __io_xattr_finish(req); + if (ret < 0) + req_set_fail(req); + + io_req_complete(req, ret); +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (unlikely(sqe->ioprio)) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fsetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +static int io_fsetxattr(struct io_kiocb *req, 
unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + + return 0; +} + +static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -7151,6 +7301,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_linkat_prep(req, sqe); case IORING_OP_MSG_RING: return io_msg_ring_prep(req, sqe); + case IORING_OP_FSETXATTR: + return io_fsetxattr_prep(req, sqe); + case IORING_OP_SETXATTR: + return io_setxattr_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7295,6 +7449,10 @@ static void io_clean_op(struct io_kiocb *req) if (req->statx.filename) putname(req->statx.filename); break; + case IORING_OP_SETXATTR: + case IORING_OP_FSETXATTR: + __io_xattr_finish(req); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -7451,6 +7609,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_MSG_RING: ret = io_msg_ring(req, issue_flags); break; + case IORING_OP_FSETXATTR: + ret = io_fsetxattr(req, issue_flags); + break; + case IORING_OP_SETXATTR: + ret = io_setxattr(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -12012,6 +12176,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); + BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 980d82eb196e..864bd6a4d4ff 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -45,6 +45,7 @@ struct io_uring_sqe { __u32 rename_flags; __u32 unlink_flags; __u32 hardlink_flags; + __u32 xattr_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -60,7 +61,8 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 __pad2[2]; + __u64 addr3; + __u64 __pad2[1]; }; enum { @@ -145,6 +147,8 @@ enum { IORING_OP_SYMLINKAT, IORING_OP_LINKAT, IORING_OP_MSG_RING, + IORING_OP_FSETXATTR, + IORING_OP_SETXATTR, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From a56834e0fafe0adf7f22a28a5dbec3e8c3031a0e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Wed, 23 Mar 2022 08:44:20 -0700 Subject: io_uring: add fgetxattr and getxattr support This adds support to io_uring for the fgetxattr and getxattr API. 
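Again purely as illustration, the mirror-image mapping for the get side, following __io_getxattr_prep()/io_getxattr_prep() in the patch below (the buffer and path are made up; note the prep rejects a nonzero xattr_flags):

	/* Hypothetical raw-SQE setup for IORING_OP_GETXATTR */
	char buf[256];

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_GETXATTR;
	sqe->addr  = (__u64)(unsigned long)"user.comment"; /* attribute name */
	sqe->addr2 = (__u64)(unsigned long)buf;            /* destination buffer */
	sqe->len   = sizeof(buf);                          /* buffer size */
	sqe->addr3 = (__u64)(unsigned long)"/tmp/f";       /* target path */
	sqe->xattr_flags = 0;                              /* must be zero */

On completion, cqe->res holds the value length on success or a negative errno, per do_getxattr() semantics; IORING_OP_FGETXATTR again works on sqe->fd rather than a path.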
Signed-off-by: Stefan Roesch Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20220323154420.3301504-5-shr@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 129 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 + 2 files changed, 131 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index bb812d007d30..2221fe01b1c4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1223,6 +1223,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1 }, [IORING_OP_SETXATTR] = {}, + [IORING_OP_FGETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_GETXATTR] = {}, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -4224,6 +4228,119 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) io_req_complete(req, ret); } +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (unlikely(sqe->ioprio)) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_fgetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +static int io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return 0; +} + +static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + static int __io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -7305,6 +7422,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_fsetxattr_prep(req, sqe); 
case IORING_OP_SETXATTR: return io_setxattr_prep(req, sqe); + case IORING_OP_FGETXATTR: + return io_fgetxattr_prep(req, sqe); + case IORING_OP_GETXATTR: + return io_getxattr_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7451,6 +7572,8 @@ static void io_clean_op(struct io_kiocb *req) break; case IORING_OP_SETXATTR: case IORING_OP_FSETXATTR: + case IORING_OP_GETXATTR: + case IORING_OP_FGETXATTR: __io_xattr_finish(req); break; } @@ -7615,6 +7738,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_SETXATTR: ret = io_setxattr(req, issue_flags); break; + case IORING_OP_FGETXATTR: + ret = io_fgetxattr(req, issue_flags); + break; + case IORING_OP_GETXATTR: + ret = io_getxattr(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 864bd6a4d4ff..8ca1d9ae56d6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -149,6 +149,8 @@ enum { IORING_OP_MSG_RING, IORING_OP_FSETXATTR, IORING_OP_SETXATTR, + IORING_OP_FGETXATTR, + IORING_OP_GETXATTR, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 1374e08e2d44863c931910797852589803997668 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2022 14:22:40 -0600 Subject: io_uring: add socket(2) support Supports both regular socket(2), where a normal file descriptor is instantiated when called, and direct descriptors. Link: https://lore.kernel.org/r/20220412202240.234207-3-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 76 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2221fe01b1c4..bf95ef9240e5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -576,6 +576,16 @@ struct io_accept { unsigned long nofile; }; +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + struct io_sync { struct file *file; loff_t len; @@ -951,6 +961,7 @@ struct io_kiocb { struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; + struct io_socket sock; }; u8 opcode; @@ -1227,6 +1238,9 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1 }, [IORING_OP_GETXATTR] = {}, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -6017,6 +6031,62 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = &req->sock; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +static int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = &req->sock; + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags,
sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + sock->file_slot - 1); + } + __io_req_complete(req, issue_flags, ret, 0); + return 0; +} + static int io_connect_prep_async(struct io_kiocb *req) { struct io_async_connect *io = req->async_data; @@ -6105,6 +6175,7 @@ IO_NETOP_PREP_ASYNC(sendmsg); IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); +IO_NETOP_PREP(socket); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -7426,6 +7497,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_fgetxattr_prep(req, sqe); case IORING_OP_GETXATTR: return io_getxattr_prep(req, sqe); + case IORING_OP_SOCKET: + return io_socket_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7744,6 +7817,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_GETXATTR: ret = io_getxattr(req, issue_flags); break; + case IORING_OP_SOCKET: + ret = io_socket(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8ca1d9ae56d6..5fb52bf32435 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -151,6 +151,7 @@ enum { IORING_OP_SETXATTR, IORING_OP_FGETXATTR, IORING_OP_GETXATTR, + IORING_OP_SOCKET, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 8d92e4fbcf0fb7ecb24223b7b1ce95b9beb4dfa2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Apr 2022 06:44:21 +0300 Subject: devlink: introduce line card devices support A line card can contain one or more devices that it makes sense to make visible to the user. For example, this can be a gearbox with flash memory, which could be updated. Provide the driver with the possibility to attach such devices to a line card and expose them to the user. Example: $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G devices: device 0 device 1 device 2 device 3 Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S.
Miller --- include/net/devlink.h | 7 +++ include/uapi/linux/devlink.h | 3 ++ net/core/devlink.c | 104 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 2a2a2a0c93f7..c84b52fb9ff0 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1578,6 +1578,13 @@ struct devlink_linecard * devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); +struct devlink_linecard_device; +struct devlink_linecard_device * +devlink_linecard_device_create(struct devlink_linecard *linecard, + unsigned int device_index); +void +devlink_linecard_device_destroy(struct devlink_linecard *linecard, + struct devlink_linecard_device *linecard_device); void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type); void devlink_linecard_provision_clear(struct devlink_linecard *linecard); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b3d40a5d72ff..cd578645f94f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -575,6 +575,9 @@ enum devlink_attr { DEVLINK_ATTR_LINECARD_STATE, /* u8 */ DEVLINK_ATTR_LINECARD_TYPE, /* string */ DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES, /* nested */ + DEVLINK_ATTR_LINECARD_DEVICE_LIST, /* nested */ + DEVLINK_ATTR_LINECARD_DEVICE, /* nested */ + DEVLINK_ATTR_LINECARD_DEVICE_INDEX, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 5cc88490f18f..41d9631ceada 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -83,10 +83,11 @@ struct devlink_linecard { const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state */ + struct mutex state_lock; /* Protects state and device_list */ const char *type; struct devlink_linecard_type *types; unsigned int types_count; + struct list_head device_list; }; /** @@ -2058,6 +2059,55 @@ struct devlink_linecard_type { const void *priv; }; +struct devlink_linecard_device { + struct list_head list; + unsigned int index; +}; + +static int +devlink_nl_linecard_device_fill(struct sk_buff *msg, + struct devlink_linecard_device *linecard_device) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); + if (!attr) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, + linecard_device->index)) { + nla_nest_cancel(msg, attr); + return -EMSGSIZE; + } + nla_nest_end(msg, attr); + + return 0; +} + +static int devlink_nl_linecard_devices_fill(struct sk_buff *msg, + struct devlink_linecard *linecard) +{ + struct devlink_linecard_device *linecard_device; + struct nlattr *attr; + int err; + + if (list_empty(&linecard->device_list)) + return 0; + + attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); + if (!attr) + return -EMSGSIZE; + list_for_each_entry(linecard_device, &linecard->device_list, list) { + err = devlink_nl_linecard_device_fill(msg, linecard_device); + if (err) { + nla_nest_cancel(msg, attr); + return err; + } + } + nla_nest_end(msg, attr); + + return 0; +} + static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, @@ -2068,6 +2118,7 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, struct 
devlink_linecard_type *linecard_type; struct nlattr *attr; void *hdr; + int err; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -2100,6 +2151,10 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, nla_nest_end(msg, attr); } + err = devlink_nl_linecard_devices_fill(msg, linecard); + if (err) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -10264,6 +10319,7 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, linecard->priv = priv; linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; mutex_init(&linecard->state_lock); + INIT_LIST_HEAD(&linecard->device_list); err = devlink_linecard_types_init(linecard); if (err) { @@ -10291,6 +10347,7 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) struct devlink *devlink = linecard->devlink; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + WARN_ON(!list_empty(&linecard->device_list)); mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); @@ -10299,6 +10356,50 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) } EXPORT_SYMBOL_GPL(devlink_linecard_destroy); +/** + * devlink_linecard_device_create - Create a device on linecard + * + * @linecard: devlink linecard + * @device_index: index of the linecard device + * + * Return: Line card device structure or an ERR_PTR() encoded error code. + */ +struct devlink_linecard_device * +devlink_linecard_device_create(struct devlink_linecard *linecard, + unsigned int device_index) +{ + struct devlink_linecard_device *linecard_device; + + linecard_device = kzalloc(sizeof(*linecard_device), GFP_KERNEL); + if (!linecard_device) + return ERR_PTR(-ENOMEM); + linecard_device->index = device_index; + mutex_lock(&linecard->state_lock); + list_add_tail(&linecard_device->list, &linecard->device_list); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + return linecard_device; +} +EXPORT_SYMBOL_GPL(devlink_linecard_device_create); + +/** + * devlink_linecard_device_destroy - Destroy device on linecard + * + * @linecard: devlink linecard + * @linecard_device: devlink linecard device + */ +void +devlink_linecard_device_destroy(struct devlink_linecard *linecard, + struct devlink_linecard_device *linecard_device) +{ + mutex_lock(&linecard->state_lock); + list_del(&linecard_device->list); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + kfree(linecard_device); +} +EXPORT_SYMBOL_GPL(devlink_linecard_device_destroy); + /** * devlink_linecard_provision_set - Set provisioning on linecard * @@ -10331,6 +10432,7 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); + WARN_ON(!list_empty(&linecard->device_list)); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); -- cgit v1.2.3 From 276910aecc6a4076f5fbfd8160ff70695d6c1eb5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Apr 2022 06:44:22 +0300 Subject: devlink: introduce line card info get message Allow the driver to provide per line card info get op to fill-up info, similar to the "devlink dev info". Example: $ devlink lc info pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 versions: fixed: hw.revision 0 running: ini.version 4 Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- .../networking/devlink/devlink-linecard.rst | 4 + include/net/devlink.h | 7 +- include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 130 ++++++++++++++++++++- 4 files changed, 138 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-linecard.rst b/Documentation/networking/devlink/devlink-linecard.rst index 6c0b8928bc13..5a8d5989702a 100644 --- a/Documentation/networking/devlink/devlink-linecard.rst +++ b/Documentation/networking/devlink/devlink-linecard.rst @@ -14,6 +14,7 @@ system. Following operations are provided: * Get a list of supported line card types. * Provision of a slot with specific line card type. * Get and monitor of line card state and its change. + * Get information about line card versions. Line card according to the type may contain one or more gearboxes to mux the lanes with certain speed to multiple ports with lanes @@ -120,3 +121,6 @@ Example usage # Set slot 8 to be unprovisioned: $ devlink lc set pci/0000:01:00.0 lc 8 notype + + # Set info for slot 8: + $ devlink lc info pci/0000:01:00.0 lc 8 diff --git a/include/net/devlink.h b/include/net/devlink.h index c84b52fb9ff0..f96dcb376630 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -150,6 +150,8 @@ struct devlink_port_new_attrs { sfnum_valid:1; }; +struct devlink_info_req; + /** * struct devlink_linecard_ops - Linecard operations * @provision: callback to provision the linecard slot with certain @@ -168,6 +170,7 @@ struct devlink_port_new_attrs { * provisioned. * @types_count: callback to get number of supported types * @types_get: callback to get next type in list + * @info_get: callback to get linecard info */ struct devlink_linecard_ops { int (*provision)(struct devlink_linecard *linecard, void *priv, @@ -182,6 +185,9 @@ struct devlink_linecard_ops { void (*types_get)(struct devlink_linecard *linecard, void *priv, unsigned int index, const char **type, const void **type_priv); + int (*info_get)(struct devlink_linecard *linecard, void *priv, + struct devlink_info_req *req, + struct netlink_ext_ack *extack); }; struct devlink_sb_pool_info { @@ -628,7 +634,6 @@ struct devlink_flash_update_params { #define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; -struct devlink_info_req; /** * struct devlink_region_ops - Region operations diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cd578645f94f..fb8c3864457f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -136,6 +136,8 @@ enum devlink_command { DEVLINK_CMD_LINECARD_NEW, DEVLINK_CMD_LINECARD_DEL, + DEVLINK_CMD_LINECARD_INFO_GET, /* can dump */ + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 41d9631ceada..5facd10de64a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2424,6 +2424,125 @@ static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, return 0; } +struct devlink_info_req { + struct sk_buff *msg; +}; + +static int +devlink_nl_linecard_info_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_linecard *linecard, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, struct netlink_ext_ack *extack) +{ + struct devlink_info_req req; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + 
if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) + goto nla_put_failure; + + req.msg = msg; + err = linecard->ops->info_get(linecard, linecard->priv, &req, extack); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +static int devlink_nl_cmd_linecard_info_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + if (!linecard->ops->info_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_info_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_INFO_GET, + info->snd_portid, info->snd_seq, 0, + info->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_linecard_info_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_linecard *linecard; + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err = 0; + + mutex_lock(&devlink_mutex); + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + mutex_lock(&devlink->linecards_lock); + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < start || !linecard->ops->info_get) { + idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_info_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_INFO_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + mutex_unlock(&devlink->linecards_lock); + devlink_put(devlink); + goto out; + } + idx++; + } + mutex_unlock(&devlink->linecards_lock); +retry: + devlink_put(devlink); + } +out: + mutex_unlock(&devlink_mutex); + + if (err != -EMSGSIZE) + return err; + + cb->args[0] = idx; + return msg->len; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -6416,10 +6535,6 @@ out_dev: return err; } -struct devlink_info_req { - struct sk_buff *msg; -}; - int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) { return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); @@ -9139,6 +9254,13 @@ static const struct genl_small_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, }, + { + .cmd = DEVLINK_CMD_LINECARD_INFO_GET, + .doit = devlink_nl_cmd_linecard_info_get_doit, + .dumpit = devlink_nl_cmd_linecard_info_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, + /* can be retrieved by unprivileged users */ + }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -- cgit v1.2.3 From e8e7fbb6a39cd6761c843d97851eb40c5885e922 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Tue, 12 Apr 2022 11:49:43 +0200 Subject: PCI: Add PCI_EXP_SLTCTL_ASPL_DISABLE macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add macro defining Auto Slot Power Limit Disable bit in Slot Control Register. 
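As a hypothetical usage sketch (not from this series): a port driver that wants to program the slot power limit itself could mask the automatic delivery with the new bit, using the generic Slot Control accessor; the 'pdev' here is assumed to be the downstream port's pci_dev:

	/* Inhibit the automatically sent Set_Slot_Power_Limit message */
	pcie_capability_clear_and_set_word(pdev, PCI_EXP_SLTCTL,
					   0, PCI_EXP_SLTCTL_ASPL_DISABLE);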
Link: https://lore.kernel.org/r/20220412094946.27069-2-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index bee1a9ed6e66..108f8523fa04 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -616,6 +616,7 @@ #define PCI_EXP_SLTCTL_PWR_OFF 0x0400 /* Power Off */ #define PCI_EXP_SLTCTL_EIC 0x0800 /* Electromechanical Interlock Control */ #define PCI_EXP_SLTCTL_DLLSCE 0x1000 /* Data Link Layer State Changed Enable */ +#define PCI_EXP_SLTCTL_ASPL_DISABLE 0x2000 /* Auto Slot Power Limit Disable */ #define PCI_EXP_SLTCTL_IBPD_DISABLE 0x4000 /* In-band PD disable */ #define PCI_EXP_SLTSTA 0x1a /* Slot Status */ #define PCI_EXP_SLTSTA_ABP 0x0001 /* Attention Button Pressed */ -- cgit v1.2.3 From 7d5e005d982527e4029b0139823d179986e34cdc Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:25 +0300 Subject: fanotify: implement "evictable" inode marks When an inode mark is created with flag FAN_MARK_EVICTABLE, it will not pin the marked inode to inode cache, so when inode is evicted from cache due to memory pressure, the mark will be lost. When an inode mark with flag FAN_MARK_EVICTABLE is updated without using this flag, the marked inode is pinned to inode cache. When an inode mark is updated with flag FAN_MARK_EVICTABLE but an existing mark already has the inode pinned, the mark update fails with error EEXIST. Evictable inode marks can be used to set up inode marks with ignored mask to suppress events from uninteresting files or directories in a lazy manner, upon receiving the first event, without having to iterate all the uninteresting files or directories beforehand. The evictable inode mark feature allows performing this lazy mark setup without exhausting the system memory with pinned inodes. This change does not enable the feature yet.
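Once the feature is enabled, the intended userspace pattern might look like the following sketch (the path and event mask are illustrative only): on the first uninteresting event, lazily add an evictable ignore mark instead of walking all uninteresting directories up front.

	/* Hypothetical: suppress open events from a noisy directory; if the
	 * inode is later evicted under memory pressure the mark is simply
	 * lost and the next event re-triggers this setup. Assumes
	 * fanotify_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY). */
	fanotify_mark(fanotify_fd,
		      FAN_MARK_ADD | FAN_MARK_EVICTABLE |
		      FAN_MARK_IGNORED_MASK | FAN_MARK_IGNORED_SURV_MODIFY,
		      FAN_OPEN, AT_FDCWD, "/path/to/noisy/dir");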
Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiRDpuS=2uA6+ZUM7yG9vVU-u212tkunBmSnP_u=mkv=Q@mail.gmail.com/ Link: https://lore.kernel.org/r/20220422120327.3459282-15-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 2 ++ fs/notify/fanotify/fanotify_user.c | 38 ++++++++++++++++++++++++++++++++++++-- include/uapi/linux/fanotify.h | 1 + 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 87142bc0131a..80e0ec95b113 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -497,6 +497,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; + if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) + mflags |= FAN_MARK_EVICTABLE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4005ee8e6e2c..ae36138afead 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1084,6 +1084,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { + bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); bool recalc = false; /* @@ -1099,7 +1100,18 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, recalc = true; } - return recalc; + if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE || + want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return recalc; + + /* + * NO_IREF may be removed from a mark, but not added. + * When removed, fsnotify_recalc_mask() will take the inode ref. + */ + WARN_ON_ONCE(!want_iref); + fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF; + + return true; } static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, @@ -1125,6 +1137,7 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, + unsigned int fan_flags, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; @@ -1147,6 +1160,9 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, } fsnotify_init_mark(mark, group); + if (fan_flags & FAN_MARK_EVICTABLE) + mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); if (ret) { fsnotify_put_mark(mark); @@ -1183,13 +1199,23 @@ static int fanotify_add_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid); + fsn_mark = fanotify_add_new_mark(group, connp, obj_type, + fan_flags, fsid); if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); return PTR_ERR(fsn_mark); } } + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { + ret = -EEXIST; + goto out; + } + /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). 
@@ -1601,6 +1627,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, mark_type != FAN_MARK_FILESYSTEM) goto fput_and_out; + /* + * Evictable is only relevant for inode marks, because only inode object + * can be evicted on memory pressure. + */ + if (flags & FAN_MARK_EVICTABLE && + mark_type != FAN_MARK_INODE) + goto fput_and_out; + /* * Events that do not carry enough information to report * event->fd require a group that supports reporting fid. Those diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index e8ac38cc2fd6..f1f89132d60e 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -82,6 +82,7 @@ #define FAN_MARK_IGNORED_SURV_MODIFY 0x00000040 #define FAN_MARK_FLUSH 0x00000080 /* FAN_MARK_FILESYSTEM is 0x00000100 */ +#define FAN_MARK_EVICTABLE 0x00000200 /* These are NOT bitwise flags. Both bits can be used togther. */ #define FAN_MARK_INODE 0x00000000 -- cgit v1.2.3 From c0a5a21c25f37c9fd7b36072f9968cdff1e4aa13 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:51 +0530 Subject: bpf: Allow storing referenced kptr in map Extending the code in previous commits, introduce referenced kptr support, which needs to be tagged using the 'kptr_ref' tag instead. Unlike unreferenced kptr, referenced kptr have a lot more restrictions. In addition to the type matching, only a newly introduced bpf_kptr_xchg helper is allowed to modify the map value at that offset. This transfers the referenced pointer being stored into the map, releasing the reference state for the program, returning the old value and creating a new reference state for the returned pointer. Similar to the unreferenced pointer case, the return value for this case will also be PTR_TO_BTF_ID_OR_NULL. The reference for the returned pointer must eventually either be released by calling the corresponding release function, or be transferred into another map. It is also allowed to call bpf_kptr_xchg with a NULL pointer, to clear the value, and obtain the old value if any. BPF_LDX, BPF_STX, and BPF_ST cannot access referenced kptr. A future commit will permit using BPF_LDX for such pointers, while attempting to make it safe, since the lifetime of the object won't be guaranteed. There are valid reasons to enforce the restriction of permitting only bpf_kptr_xchg to operate on referenced kptr. The pointer value must be consistent in the face of concurrent modification, and any prior values contained in the map must also be released before a new one is moved into the map. To ensure proper transfer of this ownership, bpf_kptr_xchg returns the old value, which the verifier would require the user to either free or move into another map, and releases the reference held for the pointer being moved in. In the future, the direct BPF_XCHG instruction may also be permitted to work like the bpf_kptr_xchg helper. Note that process_kptr_func doesn't have to call check_helper_mem_access, since we already disallow rdonly/wronly flags for map, which is what check_map_access_type checks, and we already ensure the PTR_TO_MAP_VALUE refers to kptr by obtaining its off_desc, so check_map_access is also not required.
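To make the ownership flow concrete, a hypothetical BPF-side sketch of the pattern this series enables; the acquire/release kfuncs are placeholders, only the 'kptr_ref' type tag and bpf_kptr_xchg() come from this series, and the usual vmlinux.h/bpf_helpers.h/bpf_tracing.h includes are assumed:

	#define __kptr_ref __attribute__((btf_type_tag("kptr_ref")))

	struct map_value {
		struct task_struct __kptr_ref *task;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct map_value);
	} kptr_map SEC(".maps");

	SEC("tp_btf/task_newtask")
	int BPF_PROG(store_task, struct task_struct *task, u64 clone_flags)
	{
		struct task_struct *old, *acquired;
		struct map_value *v;
		int key = 0;

		v = bpf_map_lookup_elem(&kptr_map, &key);
		if (!v)
			return 0;
		acquired = task_acquire(task);	/* placeholder acquire kfunc */
		if (!acquired)
			return 0;
		/* move the new reference into the map, take the old one out */
		old = bpf_kptr_xchg(&v->task, acquired);
		if (old)
			task_release(old);	/* placeholder release kfunc */
		return 0;
	}

Routing the update through the helper instead of a plain BPF_STX store is what lets the verifier prove the previous value is always released before being overwritten.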
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-4-memxor@gmail.com --- include/linux/bpf.h | 8 ++++ include/uapi/linux/bpf.h | 12 ++++++ kernel/bpf/btf.c | 10 ++++- kernel/bpf/helpers.c | 24 +++++++++++ kernel/bpf/verifier.c | 98 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 12 ++++++ 6 files changed, 151 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 492edd2c5713..24310837bafc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -160,8 +160,14 @@ enum { BPF_MAP_VALUE_OFF_MAX = 8, }; +enum bpf_kptr_type { + BPF_KPTR_UNREF, + BPF_KPTR_REF, +}; + struct bpf_map_value_off_desc { u32 offset; + enum bpf_kptr_type type; struct { struct btf *btf; u32 btf_id; @@ -418,6 +424,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ + ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -427,6 +434,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d14b10b85e51..444fe6f1cf35 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5143,6 +5143,17 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5339,6 +5350,7 @@ union bpf_attr { FN(copy_from_user_task), \ FN(skb_set_tstamp), \ FN(ima_file_hash), \ + FN(kptr_xchg), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f0287342204f..4138c51728dd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3177,6 +3177,7 @@ enum { struct btf_field_info { u32 type_id; u32 off; + enum bpf_kptr_type type; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, @@ -3193,6 +3194,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { + enum bpf_kptr_type type; u32 res_id; /* For PTR, sz is always == 8 */ @@ -3205,7 +3207,11 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_UNREF; + else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_REF; + else return -EINVAL; /* Get the base type */ @@ -3216,6 +3222,7 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, info->type_id = res_id; info->off = off; + info->type = type; return BTF_FIELD_FOUND; } @@ -3420,6 +3427,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, } tab->off[i].offset = info_arr[i].off; + tab->off[i].type = info_arr[i].type; tab->off[i].kptr.btf_id = id; tab->off[i].kptr.btf = kernel_btf; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 315053ef6a75..3e709fed5306 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1374,6 +1374,28 @@ out: kfree(t); } +BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +{ + unsigned long *kptr = map_value; + + return xchg(kptr, (unsigned long)ptr); +} + +/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() + * helper is determined dynamically by the verifier. 
+ */ +#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) + +const struct bpf_func_proto bpf_kptr_xchg_proto = { + .func = bpf_kptr_xchg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = BPF_PTR_POISON, + .arg1_type = ARG_PTR_TO_KPTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, + .arg2_btf_id = BPF_PTR_POISON, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1452,6 +1474,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; + case BPF_FUNC_kptr_xchg: + return &bpf_kptr_xchg_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5426bab7f02c..c9ee44efed89 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -258,6 +258,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; + struct bpf_map_value_off_desc *kptr_off_desc; }; struct btf *btf_vmlinux; @@ -489,7 +490,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_ringbuf_reserve) + func_id == BPF_FUNC_ringbuf_reserve || + func_id == BPF_FUNC_kptr_xchg) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -3514,6 +3516,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, /* We need to verify reg->type and reg->btf, before accessing reg->btf */ reg_name = kernel_type_name(reg->btf, reg->btf_id); + /* For ref_ptr case, release function check should ensure we get one + * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the + * normal store of unreferenced kptr, we must ensure var_off is zero. + * Since ref_ptr cannot be accessed directly by BPF insns, checks for + * reg->off and reg->ref_obj_id are not needed here. + */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; @@ -3569,6 +3577,12 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } + /* We cannot directly access kptr_ref */ + if (off_desc->type == BPF_KPTR_REF) { + verbose(env, "accessing referenced kptr disallowed\n"); + return -EACCES; + } + if (class == BPF_LDX) { val_reg = reg_state(env, value_regno); /* We can simply mark the value_regno receiving the pointer @@ -5293,6 +5307,53 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_kptr_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map_value_off_desc *off_desc; + struct bpf_map *map_ptr = reg->map_ptr; + u32 kptr_off; + int ret; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. 
kptr has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + if (!map_value_has_kptrs(map_ptr)) { + ret = PTR_ERR(map_ptr->kptr_off_tab); + if (ret == -E2BIG) + verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, + BPF_MAP_VALUE_OFF_MAX); + else if (ret == -EEXIST) + verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); + else + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + return -EINVAL; + } + + meta->map_ptr = map_ptr; + kptr_off = reg->off + reg->var_off.value; + off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); + if (!off_desc) { + verbose(env, "off=%d doesn't point to kptr\n", kptr_off); + return -EACCES; + } + if (off_desc->type != BPF_KPTR_REF) { + verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); + return -EACCES; + } + meta->kptr_off_desc = off_desc; + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return base_type(type) == ARG_PTR_TO_MEM || @@ -5433,6 +5494,7 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5460,11 +5522,13 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, + [ARG_PTR_TO_KPTR] = &kptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, - const u32 *arg_btf_id) + const u32 *arg_btf_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected, type = reg->type; @@ -5517,8 +5581,11 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id)) { + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + return -EACCES; + } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, kernel_type_name(reg->btf, reg->btf_id), kernel_type_name(btf_vmlinux, *arg_btf_id)); @@ -5625,7 +5692,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta); if (err) return err; @@ -5801,6 +5868,9 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } + } else if (arg_type == ARG_PTR_TO_KPTR) { + if (process_kptr_func(env, regno, meta)) + return -EACCES; } return err; @@ -6143,10 +6213,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && 
!fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) return false; } @@ -7012,21 +7082,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; - ret_btf_id = *fn->ret_btf_id; + if (func_id == BPF_FUNC_kptr_xchg) { + ret_btf = meta.kptr_off_desc->kptr.btf; + ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + } else { + ret_btf = btf_vmlinux; + ret_btf_id = *fn->ret_btf_id; + } if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - /* current BPF helper definitions are only coming from - * built-in code with type IDs from vmlinux BTF - */ - regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %u of func %s#%d\n", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d14b10b85e51..444fe6f1cf35 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5143,6 +5143,17 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL; otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer, + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5339,6 +5350,7 @@ union bpf_attr { FN(copy_from_user_task), \ FN(skb_set_tstamp), \ FN(ima_file_hash), \ + FN(kptr_xchg), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From d8fc1c7c4c9b705ce5f5bba772ad66a0137c685d Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Mon, 25 Apr 2022 09:16:17 -0500 Subject: tee: remove flags TEE_IOCTL_SHM_MAPPED and TEE_IOCTL_SHM_DMA_BUF These look to be leftover from an early version of this driver. Userspace does not need this information. Checking all users of this that I have access to, I have verified no one is using them. They leak internal use flags out to userspace. Even more, they are no longer correct after a45ea4efa358. Let's drop these flags before someone does try to use them for something and they become ABI.
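To see what this means for callers, here is a sketch of the userspace side after this change (illustrative only; error handling trimmed). The kernel no longer writes the flags field of the ioctl argument, so code should rely only on the returned fd, id and size:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/tee.h>

/* Allocate TEE shared memory; returns an fd that can be mmap()ed. */
int alloc_tee_shm(int tee_fd, unsigned long len)
{
	struct tee_ioctl_shm_alloc_data data = {
		.size = len,	/* in: requested size, out: actual size */
	};
	int shm_fd;

	shm_fd = ioctl(tee_fd, TEE_IOC_SHM_ALLOC, &data);
	if (shm_fd < 0)
		return -1;
	/* data.flags is no longer populated by the kernel */
	printf("shm id %d, size %llu\n", data.id, (unsigned long long)data.size);
	return shm_fd;
}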
Signed-off-by: Andrew Davis Acked-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 2 -- include/uapi/linux/tee.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 8aa1a4836b92..af0f7c603fa4 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -302,7 +302,6 @@ static int tee_ioctl_shm_alloc(struct tee_context *ctx, return PTR_ERR(shm); data.id = shm->id; - data.flags = shm->flags; data.size = shm->size; if (copy_to_user(udata, &data, sizeof(data))) @@ -339,7 +338,6 @@ tee_ioctl_shm_register(struct tee_context *ctx, return PTR_ERR(shm); data.id = shm->id; - data.flags = shm->flags; data.length = shm->size; if (copy_to_user(udata, &data, sizeof(data))) diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 25a6c534beb1..23e57164693c 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -42,10 +42,6 @@ #define TEE_IOC_MAGIC 0xa4 #define TEE_IOC_BASE 0 -/* Flags relating to shared memory */ -#define TEE_IOCTL_SHM_MAPPED 0x1 /* memory mapped in normal world */ -#define TEE_IOCTL_SHM_DMA_BUF 0x2 /* dma-buf handle on shared memory */ - #define TEE_MAX_ARG_SIZE 1024 #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ -- cgit v1.2.3 From cc51eaa8b530bf070e76847a717adcbf603469b7 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 26 Apr 2022 01:29:04 -0700 Subject: io_uring: add type to op enum It is useful to have a type enum for opcodes, to allow the compiler to assert that every value is used in a switch statement. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220426082907.3600028-2-dylany@fb.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5fb52bf32435..49d1f3994f8d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -105,7 +105,7 @@ enum { #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ -enum { +enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, IORING_OP_WRITEV, -- cgit v1.2.3 From 052e1f01bfae8be6f31b61ed3a2356edfca855dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 Apr 2022 10:54:33 -0700 Subject: net: atm: remove support for ZeitNet ZN122x ATM devices This driver received nothing but automated fixes in the last 15 years. Since it's using virt_to_bus it's unlikely to be used on any modern platform. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- arch/mips/configs/gpr_defconfig | 1 - arch/mips/configs/mtx1_defconfig | 1 - drivers/atm/Kconfig | 20 - drivers/atm/Makefile | 1 - drivers/atm/uPD98401.h | 293 ------- drivers/atm/uPD98402.c | 266 ------ drivers/atm/uPD98402.h | 107 --- drivers/atm/zatm.c | 1652 -------------------------------------- drivers/atm/zatm.h | 104 --- include/uapi/linux/atm_zatm.h | 47 -- 10 files changed, 2492 deletions(-) delete mode 100644 drivers/atm/uPD98401.h delete mode 100644 drivers/atm/uPD98402.c delete mode 100644 drivers/atm/uPD98402.h delete mode 100644 drivers/atm/zatm.c delete mode 100644 drivers/atm/zatm.h delete mode 100644 include/uapi/linux/atm_zatm.h (limited to 'include/uapi/linux') diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 7ed202db9ef0..d82f4ebf687f 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -178,7 +178,6 @@ CONFIG_NETCONSOLE=m CONFIG_ATM_TCP=m CONFIG_ATM_LANAI=m CONFIG_ATM_ENI=m -CONFIG_ATM_ZATM=m CONFIG_ATM_NICSTAR=m CONFIG_ATM_IDT77252=m CONFIG_ATM_IA=m diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index f46ad2e294fa..0cb4d9aa14d1 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -255,7 +255,6 @@ CONFIG_ARCNET_COM20020_CS=m CONFIG_ATM_TCP=m CONFIG_ATM_LANAI=m CONFIG_ATM_ENI=m -CONFIG_ATM_ZATM=m CONFIG_ATM_NICSTAR=m CONFIG_ATM_IDT77252=m CONFIG_ATM_IA=m diff --git a/drivers/atm/Kconfig b/drivers/atm/Kconfig index 9c778308722a..63cdb46a3439 100644 --- a/drivers/atm/Kconfig +++ b/drivers/atm/Kconfig @@ -146,26 +146,6 @@ config ATM_ENI_BURST_RX_2W try this if you have disabled 4W and 8W bursts. Enabling 2W if 4W or 8W are also set may or may not improve throughput. -config ATM_ZATM - tristate "ZeitNet ZN1221/ZN1225" - depends on PCI && VIRT_TO_BUS - help - Driver for the ZeitNet ZN1221 (MMF) and ZN1225 (UTP-5) 155 Mbps ATM - adapters. - - To compile this driver as a module, choose M here: the module will - be called zatm. - -config ATM_ZATM_DEBUG - bool "Enable extended debugging" - depends on ATM_ZATM - help - Extended debugging records various events and displays that list - when an inconsistency is detected. This mechanism is faster than - generally using printks, but still has some impact on performance. - Note that extended debugging may create certain race conditions - itself. Enable this ONLY if you suspect problems with the driver. - config ATM_NICSTAR tristate "IDT 77201 (NICStAR) (ForeRunnerLE)" depends on PCI diff --git a/drivers/atm/Makefile b/drivers/atm/Makefile index 1b6a8ddaf007..c9eade92019b 100644 --- a/drivers/atm/Makefile +++ b/drivers/atm/Makefile @@ -5,7 +5,6 @@ fore_200e-y := fore200e.o -obj-$(CONFIG_ATM_ZATM) += zatm.o uPD98402.o obj-$(CONFIG_ATM_NICSTAR) += nicstar.o obj-$(CONFIG_ATM_IA) += iphase.o suni.o obj-$(CONFIG_ATM_FORE200E) += fore_200e.o diff --git a/drivers/atm/uPD98401.h b/drivers/atm/uPD98401.h deleted file mode 100644 index f766a5ef0c5d..000000000000 --- a/drivers/atm/uPD98401.h +++ /dev/null @@ -1,293 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* drivers/atm/uPD98401.h - NEC uPD98401 (SAR) declarations */ - -/* Written 1995 by Werner Almesberger, EPFL LRC */ - - -#ifndef DRIVERS_ATM_uPD98401_H -#define DRIVERS_ATM_uPD98401_H - - -#define MAX_CRAM_SIZE (1 << 18) /* 2^18 words */ -#define RAM_INCREMENT 1024 /* check in 4 kB increments */ - -#define uPD98401_PORTS 0x24 /* probably more ? 
*/ - - -/* - * Commands - */ - -#define uPD98401_OPEN_CHAN 0x20000000 /* open channel */ -#define uPD98401_CHAN_ADDR 0x0003fff8 /* channel address */ -#define uPD98401_CHAN_ADDR_SHIFT 3 -#define uPD98401_CLOSE_CHAN 0x24000000 /* close channel */ -#define uPD98401_CHAN_RT 0x02000000 /* RX/TX (0 TX, 1 RX) */ -#define uPD98401_DEACT_CHAN 0x28000000 /* deactivate channel */ -#define uPD98401_TX_READY 0x30000000 /* TX ready */ -#define uPD98401_ADD_BAT 0x34000000 /* add batches */ -#define uPD98401_POOL 0x000f0000 /* pool number */ -#define uPD98401_POOL_SHIFT 16 -#define uPD98401_POOL_NUMBAT 0x0000ffff /* number of batches */ -#define uPD98401_NOP 0x3f000000 /* NOP */ -#define uPD98401_IND_ACC 0x00000000 /* Indirect Access */ -#define uPD98401_IA_RW 0x10000000 /* Read/Write (0 W, 1 R) */ -#define uPD98401_IA_B3 0x08000000 /* Byte select, 1 enable */ -#define uPD98401_IA_B2 0x04000000 -#define uPD98401_IA_B1 0x02000000 -#define uPD98401_IA_B0 0x01000000 -#define uPD98401_IA_BALL 0x0f000000 /* whole longword */ -#define uPD98401_IA_TGT 0x000c0000 /* Target */ -#define uPD98401_IA_TGT_SHIFT 18 -#define uPD98401_IA_TGT_CM 0 /* - Control Memory */ -#define uPD98401_IA_TGT_SAR 1 /* - uPD98401 registers */ -#define uPD98401_IA_TGT_PHY 3 /* - PHY device */ -#define uPD98401_IA_ADDR 0x0003ffff - -/* - * Command Register Status - */ - -#define uPD98401_BUSY 0x80000000 /* SAR is busy */ -#define uPD98401_LOCKED 0x40000000 /* SAR is locked by other CPU */ - -/* - * Indications - */ - -/* Normal (AAL5) Receive Indication */ -#define uPD98401_AAL5_UINFO 0xffff0000 /* user-supplied information */ -#define uPD98401_AAL5_UINFO_SHIFT 16 -#define uPD98401_AAL5_SIZE 0x0000ffff /* PDU size (in _CELLS_ !!) */ -#define uPD98401_AAL5_CHAN 0x7fff0000 /* Channel number */ -#define uPD98401_AAL5_CHAN_SHIFT 16 -#define uPD98401_AAL5_ERR 0x00008000 /* Error indication */ -#define uPD98401_AAL5_CI 0x00004000 /* Congestion Indication */ -#define uPD98401_AAL5_CLP 0x00002000 /* CLP (>= 1 cell had CLP=1) */ -#define uPD98401_AAL5_ES 0x00000f00 /* Error Status */ -#define uPD98401_AAL5_ES_SHIFT 8 -#define uPD98401_AAL5_ES_NONE 0 /* No error */ -#define uPD98401_AAL5_ES_FREE 1 /* Receiver free buf underflow */ -#define uPD98401_AAL5_ES_FIFO 2 /* Receiver FIFO overrun */ -#define uPD98401_AAL5_ES_TOOBIG 3 /* Maximum length violation */ -#define uPD98401_AAL5_ES_CRC 4 /* CRC error */ -#define uPD98401_AAL5_ES_ABORT 5 /* User abort */ -#define uPD98401_AAL5_ES_LENGTH 6 /* Length violation */ -#define uPD98401_AAL5_ES_T1 7 /* T1 error (timeout) */ -#define uPD98401_AAL5_ES_DEACT 8 /* Deactivated with DEACT_CHAN */ -#define uPD98401_AAL5_POOL 0x0000001f /* Free buffer pool number */ - -/* Raw Cell Indication */ -#define uPD98401_RAW_UINFO uPD98401_AAL5_UINFO -#define uPD98401_RAW_UINFO_SHIFT uPD98401_AAL5_UINFO_SHIFT -#define uPD98401_RAW_HEC 0x000000ff /* HEC */ -#define uPD98401_RAW_CHAN uPD98401_AAL5_CHAN -#define uPD98401_RAW_CHAN_SHIFT uPD98401_AAL5_CHAN_SHIFT - -/* Transmit Indication */ -#define uPD98401_TXI_CONN 0x7fff0000 /* Connection Number */ -#define uPD98401_TXI_CONN_SHIFT 16 -#define uPD98401_TXI_ACTIVE 0x00008000 /* Channel remains active */ -#define uPD98401_TXI_PQP 0x00007fff /* Packet Queue Pointer */ - -/* - * Directly Addressable Registers - */ - -#define uPD98401_GMR 0x00 /* General Mode Register */ -#define uPD98401_GSR 0x01 /* General Status Register */ -#define uPD98401_IMR 0x02 /* Interrupt Mask Register */ -#define uPD98401_RQU 0x03 /* Receive Queue Underrun */ -#define uPD98401_RQA 0x04 /* Receive Queue 
Alert */ -#define uPD98401_ADDR 0x05 /* Last Burst Address */ -#define uPD98401_VER 0x06 /* Version Number */ -#define uPD98401_SWR 0x07 /* Software Reset */ -#define uPD98401_CMR 0x08 /* Command Register */ -#define uPD98401_CMR_L 0x09 /* Command Register and Lock/Unlock */ -#define uPD98401_CER 0x0a /* Command Extension Register */ -#define uPD98401_CER_L 0x0b /* Command Ext Reg and Lock/Unlock */ - -#define uPD98401_MSH(n) (0x10+(n)) /* Mailbox n Start Address High */ -#define uPD98401_MSL(n) (0x14+(n)) /* Mailbox n Start Address High */ -#define uPD98401_MBA(n) (0x18+(n)) /* Mailbox n Bottom Address */ -#define uPD98401_MTA(n) (0x1c+(n)) /* Mailbox n Tail Address */ -#define uPD98401_MWA(n) (0x20+(n)) /* Mailbox n Write Address */ - -/* GMR is at 0x00 */ -#define uPD98401_GMR_ONE 0x80000000 /* Must be set to one */ -#define uPD98401_GMR_SLM 0x40000000 /* Address mode (0 word, 1 byte) */ -#define uPD98401_GMR_CPE 0x00008000 /* Control Memory Parity Enable */ -#define uPD98401_GMR_LP 0x00004000 /* Loopback */ -#define uPD98401_GMR_WA 0x00002000 /* Early Bus Write Abort/RDY */ -#define uPD98401_GMR_RA 0x00001000 /* Early Read Abort/RDY */ -#define uPD98401_GMR_SZ 0x00000f00 /* Burst Size Enable */ -#define uPD98401_BURST16 0x00000800 /* 16-word burst */ -#define uPD98401_BURST8 0x00000400 /* 8-word burst */ -#define uPD98401_BURST4 0x00000200 /* 4-word burst */ -#define uPD98401_BURST2 0x00000100 /* 2-word burst */ -#define uPD98401_GMR_AD 0x00000080 /* Address (burst resolution) Disable */ -#define uPD98401_GMR_BO 0x00000040 /* Byte Order (0 little, 1 big) */ -#define uPD98401_GMR_PM 0x00000020 /* Bus Parity Mode (0 byte, 1 word)*/ -#define uPD98401_GMR_PC 0x00000010 /* Bus Parity Control (0even,1odd) */ -#define uPD98401_GMR_BPE 0x00000008 /* Bus Parity Enable */ -#define uPD98401_GMR_DR 0x00000004 /* Receive Drop Mode (0drop,1don't)*/ -#define uPD98401_GMR_SE 0x00000002 /* Shapers Enable */ -#define uPD98401_GMR_RE 0x00000001 /* Receiver Enable */ - -/* GSR is at 0x01, IMR is at 0x02 */ -#define uPD98401_INT_PI 0x80000000 /* PHY interrupt */ -#define uPD98401_INT_RQA 0x40000000 /* Receive Queue Alert */ -#define uPD98401_INT_RQU 0x20000000 /* Receive Queue Underrun */ -#define uPD98401_INT_RD 0x10000000 /* Receiver Deactivated */ -#define uPD98401_INT_SPE 0x08000000 /* System Parity Error */ -#define uPD98401_INT_CPE 0x04000000 /* Control Memory Parity Error */ -#define uPD98401_INT_SBE 0x02000000 /* System Bus Error */ -#define uPD98401_INT_IND 0x01000000 /* Initialization Done */ -#define uPD98401_INT_RCR 0x0000ff00 /* Raw Cell Received */ -#define uPD98401_INT_RCR_SHIFT 8 -#define uPD98401_INT_MF 0x000000f0 /* Mailbox Full */ -#define uPD98401_INT_MF_SHIFT 4 -#define uPD98401_INT_MM 0x0000000f /* Mailbox Modified */ - -/* VER is at 0x06 */ -#define uPD98401_MAJOR 0x0000ff00 /* Major revision */ -#define uPD98401_MAJOR_SHIFT 8 -#define uPD98401_MINOR 0x000000ff /* Minor revision */ - -/* - * Indirectly Addressable Registers - */ - -#define uPD98401_IM(n) (0x40000+(n)) /* Scheduler n I and M */ -#define uPD98401_X(n) (0x40010+(n)) /* Scheduler n X */ -#define uPD98401_Y(n) (0x40020+(n)) /* Scheduler n Y */ -#define uPD98401_PC(n) (0x40030+(n)) /* Scheduler n P, C, p and c */ -#define uPD98401_PS(n) (0x40040+(n)) /* Scheduler n priority and status */ - -/* IM contents */ -#define uPD98401_IM_I 0xff000000 /* I */ -#define uPD98401_IM_I_SHIFT 24 -#define uPD98401_IM_M 0x00ffffff /* M */ - -/* PC contents */ -#define uPD98401_PC_P 0xff000000 /* P */ -#define uPD98401_PC_P_SHIFT 24 
-#define uPD98401_PC_C 0x00ff0000 /* C */ -#define uPD98401_PC_C_SHIFT 16 -#define uPD98401_PC_p 0x0000ff00 /* p */ -#define uPD98401_PC_p_SHIFT 8 -#define uPD98401_PC_c 0x000000ff /* c */ - -/* PS contents */ -#define uPD98401_PS_PRIO 0xf0 /* Priority level (0 high, 15 low) */ -#define uPD98401_PS_PRIO_SHIFT 4 -#define uPD98401_PS_S 0x08 /* Scan - must be 0 (internal) */ -#define uPD98401_PS_R 0x04 /* Round Robin (internal) */ -#define uPD98401_PS_A 0x02 /* Active (internal) */ -#define uPD98401_PS_E 0x01 /* Enabled */ - -#define uPD98401_TOS 0x40100 /* Top of Stack Control Memory Address */ -#define uPD98401_SMA 0x40200 /* Shapers Control Memory Start Address */ -#define uPD98401_PMA 0x40201 /* Receive Pool Control Memory Start Address */ -#define uPD98401_T1R 0x40300 /* T1 Register */ -#define uPD98401_VRR 0x40301 /* VPI/VCI Reduction Register/Recv. Shutdown */ -#define uPD98401_TSR 0x40302 /* Time-Stamp Register */ - -/* VRR is at 0x40301 */ -#define uPD98401_VRR_SDM 0x80000000 /* Shutdown Mode */ -#define uPD98401_VRR_SHIFT 0x000f0000 /* VPI/VCI Shift */ -#define uPD98401_VRR_SHIFT_SHIFT 16 -#define uPD98401_VRR_MASK 0x0000ffff /* VPI/VCI mask */ - -/* - * TX packet descriptor - */ - -#define uPD98401_TXPD_SIZE 16 /* descriptor size (in bytes) */ - -#define uPD98401_TXPD_V 0x80000000 /* Valid bit */ -#define uPD98401_TXPD_DP 0x40000000 /* Descriptor (1) or Pointer (0) */ -#define uPD98401_TXPD_SM 0x20000000 /* Single (1) or Multiple (0) */ -#define uPD98401_TXPD_CLPM 0x18000000 /* CLP mode */ -#define uPD98401_CLPM_0 0 /* 00 CLP = 0 */ -#define uPD98401_CLPM_1 3 /* 11 CLP = 1 */ -#define uPD98401_CLPM_LAST 1 /* 01 CLP unless last cell */ -#define uPD98401_TXPD_CLPM_SHIFT 27 -#define uPD98401_TXPD_PTI 0x07000000 /* PTI pattern */ -#define uPD98401_TXPD_PTI_SHIFT 24 -#define uPD98401_TXPD_GFC 0x00f00000 /* GFC pattern */ -#define uPD98401_TXPD_GFC_SHIFT 20 -#define uPD98401_TXPD_C10 0x00040000 /* insert CRC-10 */ -#define uPD98401_TXPD_AAL5 0x00020000 /* AAL5 processing */ -#define uPD98401_TXPD_MB 0x00010000 /* TX mailbox number */ -#define uPD98401_TXPD_UU 0x0000ff00 /* CPCS-UU */ -#define uPD98401_TXPD_UU_SHIFT 8 -#define uPD98401_TXPD_CPI 0x000000ff /* CPI */ - -/* - * TX buffer descriptor - */ - -#define uPD98401_TXBD_SIZE 8 /* descriptor size (in bytes) */ - -#define uPD98401_TXBD_LAST 0x80000000 /* last buffer in packet */ - -/* - * TX VC table - */ - -/* 1st word has the same structure as in a TX packet descriptor */ -#define uPD98401_TXVC_L 0x80000000 /* last buffer */ -#define uPD98401_TXVC_SHP 0x0f000000 /* shaper number */ -#define uPD98401_TXVC_SHP_SHIFT 24 -#define uPD98401_TXVC_VPI 0x00ff0000 /* VPI */ -#define uPD98401_TXVC_VPI_SHIFT 16 -#define uPD98401_TXVC_VCI 0x0000ffff /* VCI */ -#define uPD98401_TXVC_QRP 6 /* Queue Read Pointer is in word 6 */ - -/* - * RX free buffer pools descriptor - */ - -#define uPD98401_RXFP_ALERT 0x70000000 /* low water mark */ -#define uPD98401_RXFP_ALERT_SHIFT 28 -#define uPD98401_RXFP_BFSZ 0x0f000000 /* buffer size, 64*2^n */ -#define uPD98401_RXFP_BFSZ_SHIFT 24 -#define uPD98401_RXFP_BTSZ 0x00ff0000 /* batch size, n+1 */ -#define uPD98401_RXFP_BTSZ_SHIFT 16 -#define uPD98401_RXFP_REMAIN 0x0000ffff /* remaining batches in pool */ - -/* - * RX VC table - */ - -#define uPD98401_RXVC_BTSZ 0xff000000 /* remaining free buffers in batch */ -#define uPD98401_RXVC_BTSZ_SHIFT 24 -#define uPD98401_RXVC_MB 0x00200000 /* RX mailbox number */ -#define uPD98401_RXVC_POOL 0x001f0000 /* free buffer pool number */ -#define uPD98401_RXVC_POOL_SHIFT 16 
-#define uPD98401_RXVC_UINFO 0x0000ffff /* user-supplied information */ -#define uPD98401_RXVC_T1 0xffff0000 /* T1 timestamp */ -#define uPD98401_RXVC_T1_SHIFT 16 -#define uPD98401_RXVC_PR 0x00008000 /* Packet Reception, 1 if busy */ -#define uPD98401_RXVC_DR 0x00004000 /* FIFO Drop */ -#define uPD98401_RXVC_OD 0x00001000 /* Drop OAM cells */ -#define uPD98401_RXVC_AR 0x00000800 /* AAL5 or raw cell; 1 if AAL5 */ -#define uPD98401_RXVC_MAXSEG 0x000007ff /* max number of segments per PDU */ -#define uPD98401_RXVC_REM 0xfffe0000 /* remaining words in curr buffer */ -#define uPD98401_RXVC_REM_SHIFT 17 -#define uPD98401_RXVC_CLP 0x00010000 /* CLP received */ -#define uPD98401_RXVC_BFA 0x00008000 /* Buffer Assigned */ -#define uPD98401_RXVC_BTA 0x00004000 /* Batch Assigned */ -#define uPD98401_RXVC_CI 0x00002000 /* Congestion Indication */ -#define uPD98401_RXVC_DD 0x00001000 /* Dropping incoming cells */ -#define uPD98401_RXVC_DP 0x00000800 /* like PR ? */ -#define uPD98401_RXVC_CURSEG 0x000007ff /* Current Segment count */ - -/* - * RX lookup table - */ - -#define uPD98401_RXLT_ENBL 0x8000 /* Enable */ - -#endif diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c deleted file mode 100644 index 239852d85558..000000000000 --- a/drivers/atm/uPD98402.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* drivers/atm/uPD98402.c - NEC uPD98402 (PHY) declarations */ - -/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "uPD98402.h" - - -#if 0 -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - - -struct uPD98402_priv { - struct k_sonet_stats sonet_stats;/* link diagnostics */ - unsigned char framing; /* SONET/SDH framing */ - int loop_mode; /* loopback mode */ - spinlock_t lock; -}; - - -#define PRIV(dev) ((struct uPD98402_priv *) dev->phy_data) - -#define PUT(val,reg) dev->ops->phy_put(dev,val,uPD98402_##reg) -#define GET(reg) dev->ops->phy_get(dev,uPD98402_##reg) - - -static int fetch_stats(struct atm_dev *dev,struct sonet_stats __user *arg,int zero) -{ - struct sonet_stats tmp; - int error = 0; - - atomic_add(GET(HECCT),&PRIV(dev)->sonet_stats.uncorr_hcs); - sonet_copy_stats(&PRIV(dev)->sonet_stats,&tmp); - if (arg) error = copy_to_user(arg,&tmp,sizeof(tmp)); - if (zero && !error) { - /* unused fields are reported as -1, but we must not "adjust" - them */ - tmp.corr_hcs = tmp.tx_cells = tmp.rx_cells = 0; - sonet_subtract_stats(&PRIV(dev)->sonet_stats,&tmp); - } - return error ? 
-EFAULT : 0; -} - - -static int set_framing(struct atm_dev *dev,unsigned char framing) -{ - static const unsigned char sonet[] = { 1,2,3,0 }; - static const unsigned char sdh[] = { 1,0,0,2 }; - const char *set; - unsigned long flags; - - switch (framing) { - case SONET_FRAME_SONET: - set = sonet; - break; - case SONET_FRAME_SDH: - set = sdh; - break; - default: - return -EINVAL; - } - spin_lock_irqsave(&PRIV(dev)->lock, flags); - PUT(set[0],C11T); - PUT(set[1],C12T); - PUT(set[2],C13T); - PUT((GET(MDR) & ~uPD98402_MDR_SS_MASK) | (set[3] << - uPD98402_MDR_SS_SHIFT),MDR); - spin_unlock_irqrestore(&PRIV(dev)->lock, flags); - return 0; -} - - -static int get_sense(struct atm_dev *dev,u8 __user *arg) -{ - unsigned long flags; - unsigned char s[3]; - - spin_lock_irqsave(&PRIV(dev)->lock, flags); - s[0] = GET(C11R); - s[1] = GET(C12R); - s[2] = GET(C13R); - spin_unlock_irqrestore(&PRIV(dev)->lock, flags); - return (put_user(s[0], arg) || put_user(s[1], arg+1) || - put_user(s[2], arg+2) || put_user(0xff, arg+3) || - put_user(0xff, arg+4) || put_user(0xff, arg+5)) ? -EFAULT : 0; -} - - -static int set_loopback(struct atm_dev *dev,int mode) -{ - unsigned char mode_reg; - - mode_reg = GET(MDR) & ~(uPD98402_MDR_TPLP | uPD98402_MDR_ALP | - uPD98402_MDR_RPLP); - switch (__ATM_LM_XTLOC(mode)) { - case __ATM_LM_NONE: - break; - case __ATM_LM_PHY: - mode_reg |= uPD98402_MDR_TPLP; - break; - case __ATM_LM_ATM: - mode_reg |= uPD98402_MDR_ALP; - break; - default: - return -EINVAL; - } - switch (__ATM_LM_XTRMT(mode)) { - case __ATM_LM_NONE: - break; - case __ATM_LM_PHY: - mode_reg |= uPD98402_MDR_RPLP; - break; - default: - return -EINVAL; - } - PUT(mode_reg,MDR); - PRIV(dev)->loop_mode = mode; - return 0; -} - - -static int uPD98402_ioctl(struct atm_dev *dev,unsigned int cmd,void __user *arg) -{ - switch (cmd) { - - case SONET_GETSTATZ: - case SONET_GETSTAT: - return fetch_stats(dev,arg, cmd == SONET_GETSTATZ); - case SONET_SETFRAMING: - return set_framing(dev, (int)(unsigned long)arg); - case SONET_GETFRAMING: - return put_user(PRIV(dev)->framing,(int __user *)arg) ? - -EFAULT : 0; - case SONET_GETFRSENSE: - return get_sense(dev,arg); - case ATM_SETLOOP: - return set_loopback(dev, (int)(unsigned long)arg); - case ATM_GETLOOP: - return put_user(PRIV(dev)->loop_mode,(int __user *)arg) ? - -EFAULT : 0; - case ATM_QUERYLOOP: - return put_user(ATM_LM_LOC_PHY | ATM_LM_LOC_ATM | - ATM_LM_RMT_PHY,(int __user *)arg) ? 
-EFAULT : 0; - default: - return -ENOIOCTLCMD; - } -} - - -#define ADD_LIMITED(s,v) \ - { atomic_add(GET(v),&PRIV(dev)->sonet_stats.s); \ - if (atomic_read(&PRIV(dev)->sonet_stats.s) < 0) \ - atomic_set(&PRIV(dev)->sonet_stats.s,INT_MAX); } - - -static void stat_event(struct atm_dev *dev) -{ - unsigned char events; - - events = GET(PCR); - if (events & uPD98402_PFM_PFEB) ADD_LIMITED(path_febe,PFECB); - if (events & uPD98402_PFM_LFEB) ADD_LIMITED(line_febe,LECCT); - if (events & uPD98402_PFM_B3E) ADD_LIMITED(path_bip,B3ECT); - if (events & uPD98402_PFM_B2E) ADD_LIMITED(line_bip,B2ECT); - if (events & uPD98402_PFM_B1E) ADD_LIMITED(section_bip,B1ECT); -} - - -#undef ADD_LIMITED - - -static void uPD98402_int(struct atm_dev *dev) -{ - static unsigned long silence = 0; - unsigned char reason; - - while ((reason = GET(PICR))) { - if (reason & uPD98402_INT_LOS) - printk(KERN_NOTICE "%s(itf %d): signal lost\n", - dev->type,dev->number); - if (reason & uPD98402_INT_PFM) stat_event(dev); - if (reason & uPD98402_INT_PCO) { - (void) GET(PCOCR); /* clear interrupt cause */ - atomic_add(GET(HECCT), - &PRIV(dev)->sonet_stats.uncorr_hcs); - } - if ((reason & uPD98402_INT_RFO) && - (time_after(jiffies, silence) || silence == 0)) { - printk(KERN_WARNING "%s(itf %d): uPD98402 receive " - "FIFO overflow\n",dev->type,dev->number); - silence = (jiffies+HZ/2)|1; - } - } -} - - -static int uPD98402_start(struct atm_dev *dev) -{ - DPRINTK("phy_start\n"); - if (!(dev->phy_data = kmalloc(sizeof(struct uPD98402_priv),GFP_KERNEL))) - return -ENOMEM; - spin_lock_init(&PRIV(dev)->lock); - memset(&PRIV(dev)->sonet_stats,0,sizeof(struct k_sonet_stats)); - (void) GET(PCR); /* clear performance events */ - PUT(uPD98402_PFM_FJ,PCMR); /* ignore frequency adj */ - (void) GET(PCOCR); /* clear overflows */ - PUT(~uPD98402_PCO_HECC,PCOMR); - (void) GET(PICR); /* clear interrupts */ - PUT(~(uPD98402_INT_PFM | uPD98402_INT_ALM | uPD98402_INT_RFO | - uPD98402_INT_LOS),PIMR); /* enable them */ - (void) fetch_stats(dev,NULL,1); /* clear kernel counters */ - atomic_set(&PRIV(dev)->sonet_stats.corr_hcs,-1); - atomic_set(&PRIV(dev)->sonet_stats.tx_cells,-1); - atomic_set(&PRIV(dev)->sonet_stats.rx_cells,-1); - return 0; -} - - -static int uPD98402_stop(struct atm_dev *dev) -{ - /* let SAR driver worry about stopping interrupts */ - kfree(PRIV(dev)); - return 0; -} - - -static const struct atmphy_ops uPD98402_ops = { - .start = uPD98402_start, - .ioctl = uPD98402_ioctl, - .interrupt = uPD98402_int, - .stop = uPD98402_stop, -}; - - -int uPD98402_init(struct atm_dev *dev) -{ -DPRINTK("phy_init\n"); - dev->phy = &uPD98402_ops; - return 0; -} - - -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL(uPD98402_init); - -static __init int uPD98402_module_init(void) -{ - return 0; -} -module_init(uPD98402_module_init); -/* module_exit not defined so not unloadable */ diff --git a/drivers/atm/uPD98402.h b/drivers/atm/uPD98402.h deleted file mode 100644 index 437cfaa20c96..000000000000 --- a/drivers/atm/uPD98402.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* drivers/atm/uPD98402.h - NEC uPD98402 (PHY) declarations */ - -/* Written 1995 by Werner Almesberger, EPFL LRC */ - - -#ifndef DRIVERS_ATM_uPD98402_H -#define DRIVERS_ATM_uPD98402_H - -/* - * Registers - */ - -#define uPD98402_CMR 0x00 /* Command Register */ -#define uPD98402_MDR 0x01 /* Mode Register */ -#define uPD98402_PICR 0x02 /* PHY Interrupt Cause Register */ -#define uPD98402_PIMR 0x03 /* PHY Interrupt Mask Register */ -#define uPD98402_ACR 0x04 /* Alarm Cause Register 
*/ -#define uPD98402_ACMR 0x05 /* Alarm Cause Mask Register */ -#define uPD98402_PCR 0x06 /* Performance Cause Register */ -#define uPD98402_PCMR 0x07 /* Performance Cause Mask Register */ -#define uPD98402_IACM 0x08 /* Internal Alarm Cause Mask Register */ -#define uPD98402_B1ECT 0x09 /* B1 Error Count Register */ -#define uPD98402_B2ECT 0x0a /* B2 Error Count Register */ -#define uPD98402_B3ECT 0x0b /* B3 Error Count Regster */ -#define uPD98402_PFECB 0x0c /* Path FEBE Count Register */ -#define uPD98402_LECCT 0x0d /* Line FEBE Count Register */ -#define uPD98402_HECCT 0x0e /* HEC Error Count Register */ -#define uPD98402_FJCT 0x0f /* Frequence Justification Count Reg */ -#define uPD98402_PCOCR 0x10 /* Perf. Counter Overflow Cause Reg */ -#define uPD98402_PCOMR 0x11 /* Perf. Counter Overflow Mask Reg */ -#define uPD98402_C11T 0x20 /* C11T Data Register */ -#define uPD98402_C12T 0x21 /* C12T Data Register */ -#define uPD98402_C13T 0x22 /* C13T Data Register */ -#define uPD98402_F1T 0x23 /* F1T Data Register */ -#define uPD98402_K2T 0x25 /* K2T Data Register */ -#define uPD98402_C2T 0x26 /* C2T Data Register */ -#define uPD98402_F2T 0x27 /* F2T Data Register */ -#define uPD98402_C11R 0x30 /* C11T Data Register */ -#define uPD98402_C12R 0x31 /* C12T Data Register */ -#define uPD98402_C13R 0x32 /* C13T Data Register */ -#define uPD98402_F1R 0x33 /* F1T Data Register */ -#define uPD98402_K2R 0x35 /* K2T Data Register */ -#define uPD98402_C2R 0x36 /* C2T Data Register */ -#define uPD98402_F2R 0x37 /* F2T Data Register */ - -/* CMR is at 0x00 */ -#define uPD98402_CMR_PFRF 0x01 /* Send path FERF */ -#define uPD98402_CMR_LFRF 0x02 /* Send line FERF */ -#define uPD98402_CMR_PAIS 0x04 /* Send path AIS */ -#define uPD98402_CMR_LAIS 0x08 /* Send line AIS */ - -/* MDR is at 0x01 */ -#define uPD98402_MDR_ALP 0x01 /* ATM layer loopback */ -#define uPD98402_MDR_TPLP 0x02 /* PMD loopback, to host */ -#define uPD98402_MDR_RPLP 0x04 /* PMD loopback, to network */ -#define uPD98402_MDR_SS0 0x08 /* SS0 */ -#define uPD98402_MDR_SS1 0x10 /* SS1 */ -#define uPD98402_MDR_SS_MASK 0x18 /* mask */ -#define uPD98402_MDR_SS_SHIFT 3 /* shift */ -#define uPD98402_MDR_HEC 0x20 /* disable HEC inbound processing */ -#define uPD98402_MDR_FSR 0x40 /* disable frame scrambler */ -#define uPD98402_MDR_CSR 0x80 /* disable cell scrambler */ - -/* PICR is at 0x02, PIMR is at 0x03 */ -#define uPD98402_INT_PFM 0x01 /* performance counter has changed */ -#define uPD98402_INT_ALM 0x02 /* line fault */ -#define uPD98402_INT_RFO 0x04 /* receive FIFO overflow */ -#define uPD98402_INT_PCO 0x08 /* performance counter overflow */ -#define uPD98402_INT_OTD 0x20 /* OTD has occurred */ -#define uPD98402_INT_LOS 0x40 /* Loss Of Signal */ -#define uPD98402_INT_LOF 0x80 /* Loss Of Frame */ - -/* ACR is as 0x04, ACMR is at 0x05 */ -#define uPD98402_ALM_PFRF 0x01 /* path FERF */ -#define uPD98402_ALM_LFRF 0x02 /* line FERF */ -#define uPD98402_ALM_PAIS 0x04 /* path AIS */ -#define uPD98402_ALM_LAIS 0x08 /* line AIS */ -#define uPD98402_ALM_LOD 0x10 /* loss of delineation */ -#define uPD98402_ALM_LOP 0x20 /* loss of pointer */ -#define uPD98402_ALM_OOF 0x40 /* out of frame */ - -/* PCR is at 0x06, PCMR is at 0x07 */ -#define uPD98402_PFM_PFEB 0x01 /* path FEBE */ -#define uPD98402_PFM_LFEB 0x02 /* line FEBE */ -#define uPD98402_PFM_B3E 0x04 /* B3 error */ -#define uPD98402_PFM_B2E 0x08 /* B2 error */ -#define uPD98402_PFM_B1E 0x10 /* B1 error */ -#define uPD98402_PFM_FJ 0x20 /* frequency justification */ - -/* IACM is at 0x08 */ -#define 
uPD98402_IACM_PFRF 0x01 /* don't generate path FERF */ -#define uPD98402_IACM_LFRF 0x02 /* don't generate line FERF */ - -/* PCOCR is at 0x010, PCOMR is at 0x11 */ -#define uPD98402_PCO_B1EC 0x01 /* B1ECT overflow */ -#define uPD98402_PCO_B2EC 0x02 /* B2ECT overflow */ -#define uPD98402_PCO_B3EC 0x04 /* B3ECT overflow */ -#define uPD98402_PCO_PFBC 0x08 /* PFEBC overflow */ -#define uPD98402_PCO_LFBC 0x10 /* LFEVC overflow */ -#define uPD98402_PCO_HECC 0x20 /* HECCT overflow */ -#define uPD98402_PCO_FJC 0x40 /* FJCT overflow */ - - -int uPD98402_init(struct atm_dev *dev); - -#endif diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c deleted file mode 100644 index cf5fffcf98a1..000000000000 --- a/drivers/atm/zatm.c +++ /dev/null @@ -1,1652 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* drivers/atm/zatm.c - ZeitNet ZN122x device driver */ - -/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "uPD98401.h" -#include "uPD98402.h" -#include "zeprom.h" -#include "zatm.h" - - -/* - * TODO: - * - * Minor features - * - support 64 kB SDUs (will have to use multibuffer batches then :-( ) - * - proper use of CDV, credit = max(1,CDVT*PCR) - * - AAL0 - * - better receive timestamps - * - OAM - */ - -#define ZATM_COPPER 1 - -#if 0 -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#ifndef CONFIG_ATM_ZATM_DEBUG - - -#define NULLCHECK(x) - -#define EVENT(s,a,b) - - -static void event_dump(void) -{ -} - - -#else - - -/* - * NULL pointer checking - */ - -#define NULLCHECK(x) \ - if ((unsigned long) (x) < 0x30) printk(KERN_CRIT #x "==0x%x\n", (int) (x)) - -/* - * Very extensive activity logging. Greatly improves bug detection speed but - * costs a few Mbps if enabled. - */ - -#define EV 64 - -static const char *ev[EV]; -static unsigned long ev_a[EV],ev_b[EV]; -static int ec = 0; - - -static void EVENT(const char *s,unsigned long a,unsigned long b) -{ - ev[ec] = s; - ev_a[ec] = a; - ev_b[ec] = b; - ec = (ec+1) % EV; -} - - -static void event_dump(void) -{ - int n,i; - - printk(KERN_NOTICE "----- event dump follows -----\n"); - for (n = 0; n < EV; n++) { - i = (ec+n) % EV; - printk(KERN_NOTICE); - printk(ev[i] ? 
ev[i] : "(null)",ev_a[i],ev_b[i]); - } - printk(KERN_NOTICE "----- event dump ends here -----\n"); -} - - -#endif /* CONFIG_ATM_ZATM_DEBUG */ - - -#define RING_BUSY 1 /* indication from do_tx that PDU has to be - backlogged */ - -static struct atm_dev *zatm_boards = NULL; -static unsigned long dummy[2] = {0,0}; - - -#define zin_n(r) inl(zatm_dev->base+r*4) -#define zin(r) inl(zatm_dev->base+uPD98401_##r*4) -#define zout(v,r) outl(v,zatm_dev->base+uPD98401_##r*4) -#define zwait() do {} while (zin(CMR) & uPD98401_BUSY) - -/* RX0, RX1, TX0, TX1 */ -static const int mbx_entries[NR_MBX] = { 1024,1024,1024,1024 }; -static const int mbx_esize[NR_MBX] = { 16,16,4,4 }; /* entry size in bytes */ - -#define MBX_SIZE(i) (mbx_entries[i]*mbx_esize[i]) - - -/*-------------------------------- utilities --------------------------------*/ - - -static void zpokel(struct zatm_dev *zatm_dev,u32 value,u32 addr) -{ - zwait(); - zout(value,CER); - zout(uPD98401_IND_ACC | uPD98401_IA_BALL | - (uPD98401_IA_TGT_CM << uPD98401_IA_TGT_SHIFT) | addr,CMR); -} - - -static u32 zpeekl(struct zatm_dev *zatm_dev,u32 addr) -{ - zwait(); - zout(uPD98401_IND_ACC | uPD98401_IA_BALL | uPD98401_IA_RW | - (uPD98401_IA_TGT_CM << uPD98401_IA_TGT_SHIFT) | addr,CMR); - zwait(); - return zin(CER); -} - - -/*------------------------------- free lists --------------------------------*/ - - -/* - * Free buffer head structure: - * [0] pointer to buffer (for SAR) - * [1] buffer descr link pointer (for SAR) - * [2] back pointer to skb (for poll_rx) - * [3] data - * ... - */ - -struct rx_buffer_head { - u32 buffer; /* pointer to buffer (for SAR) */ - u32 link; /* buffer descriptor link pointer (for SAR) */ - struct sk_buff *skb; /* back pointer to skb (for poll_rx) */ -}; - - -static void refill_pool(struct atm_dev *dev,int pool) -{ - struct zatm_dev *zatm_dev; - struct sk_buff *skb; - struct rx_buffer_head *first; - unsigned long flags; - int align,offset,free,count,size; - - EVENT("refill_pool\n",0,0); - zatm_dev = ZATM_DEV(dev); - size = (64 << (pool <= ZATM_AAL5_POOL_BASE ? 0 : - pool-ZATM_AAL5_POOL_BASE))+sizeof(struct rx_buffer_head); - if (size < PAGE_SIZE) { - align = 32; /* for 32 byte alignment */ - offset = sizeof(struct rx_buffer_head); - } - else { - align = 4096; - offset = zatm_dev->pool_info[pool].offset+ - sizeof(struct rx_buffer_head); - } - size += align; - spin_lock_irqsave(&zatm_dev->lock, flags); - free = zpeekl(zatm_dev,zatm_dev->pool_base+2*pool) & - uPD98401_RXFP_REMAIN; - spin_unlock_irqrestore(&zatm_dev->lock, flags); - if (free >= zatm_dev->pool_info[pool].low_water) return; - EVENT("starting ... 
POOL: 0x%x, 0x%x\n", - zpeekl(zatm_dev,zatm_dev->pool_base+2*pool), - zpeekl(zatm_dev,zatm_dev->pool_base+2*pool+1)); - EVENT("dummy: 0x%08lx, 0x%08lx\n",dummy[0],dummy[1]); - count = 0; - first = NULL; - while (free < zatm_dev->pool_info[pool].high_water) { - struct rx_buffer_head *head; - - skb = alloc_skb(size,GFP_ATOMIC); - if (!skb) { - printk(KERN_WARNING DEV_LABEL "(Itf %d): got no new " - "skb (%d) with %d free\n",dev->number,size,free); - break; - } - skb_reserve(skb,(unsigned char *) ((((unsigned long) skb->data+ - align+offset-1) & ~(unsigned long) (align-1))-offset)- - skb->data); - head = (struct rx_buffer_head *) skb->data; - skb_reserve(skb,sizeof(struct rx_buffer_head)); - if (!first) first = head; - count++; - head->buffer = virt_to_bus(skb->data); - head->link = 0; - head->skb = skb; - EVENT("enq skb 0x%08lx/0x%08lx\n",(unsigned long) skb, - (unsigned long) head); - spin_lock_irqsave(&zatm_dev->lock, flags); - if (zatm_dev->last_free[pool]) - ((struct rx_buffer_head *) (zatm_dev->last_free[pool]-> - data))[-1].link = virt_to_bus(head); - zatm_dev->last_free[pool] = skb; - skb_queue_tail(&zatm_dev->pool[pool],skb); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - free++; - } - if (first) { - spin_lock_irqsave(&zatm_dev->lock, flags); - zwait(); - zout(virt_to_bus(first),CER); - zout(uPD98401_ADD_BAT | (pool << uPD98401_POOL_SHIFT) | count, - CMR); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - EVENT ("POOL: 0x%x, 0x%x\n", - zpeekl(zatm_dev,zatm_dev->pool_base+2*pool), - zpeekl(zatm_dev,zatm_dev->pool_base+2*pool+1)); - EVENT("dummy: 0x%08lx, 0x%08lx\n",dummy[0],dummy[1]); - } -} - - -static void drain_free(struct atm_dev *dev,int pool) -{ - skb_queue_purge(&ZATM_DEV(dev)->pool[pool]); -} - - -static int pool_index(int max_pdu) -{ - int i; - - if (max_pdu % ATM_CELL_PAYLOAD) - printk(KERN_ERR DEV_LABEL ": driver error in pool_index: " - "max_pdu is %d\n",max_pdu); - if (max_pdu > 65536) return -1; - for (i = 0; (64 << i) < max_pdu; i++); - return i+ZATM_AAL5_POOL_BASE; -} - - -/* use_pool isn't reentrant */ - - -static void use_pool(struct atm_dev *dev,int pool) -{ - struct zatm_dev *zatm_dev; - unsigned long flags; - int size; - - zatm_dev = ZATM_DEV(dev); - if (!(zatm_dev->pool_info[pool].ref_count++)) { - skb_queue_head_init(&zatm_dev->pool[pool]); - size = pool-ZATM_AAL5_POOL_BASE; - if (size < 0) size = 0; /* 64B... */ - else if (size > 10) size = 10; /* ... 
64kB */ - spin_lock_irqsave(&zatm_dev->lock, flags); - zpokel(zatm_dev,((zatm_dev->pool_info[pool].low_water/4) << - uPD98401_RXFP_ALERT_SHIFT) | - (1 << uPD98401_RXFP_BTSZ_SHIFT) | - (size << uPD98401_RXFP_BFSZ_SHIFT), - zatm_dev->pool_base+pool*2); - zpokel(zatm_dev,(unsigned long) dummy,zatm_dev->pool_base+ - pool*2+1); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - zatm_dev->last_free[pool] = NULL; - refill_pool(dev,pool); - } - DPRINTK("pool %d: %d\n",pool,zatm_dev->pool_info[pool].ref_count); -} - - -static void unuse_pool(struct atm_dev *dev,int pool) -{ - if (!(--ZATM_DEV(dev)->pool_info[pool].ref_count)) - drain_free(dev,pool); -} - -/*----------------------------------- RX ------------------------------------*/ - - -#if 0 -static void exception(struct atm_vcc *vcc) -{ - static int count = 0; - struct zatm_dev *zatm_dev = ZATM_DEV(vcc->dev); - struct zatm_vcc *zatm_vcc = ZATM_VCC(vcc); - unsigned long *qrp; - int i; - - if (count++ > 2) return; - for (i = 0; i < 8; i++) - printk("TX%d: 0x%08lx\n",i, - zpeekl(zatm_dev,zatm_vcc->tx_chan*VC_SIZE/4+i)); - for (i = 0; i < 5; i++) - printk("SH%d: 0x%08lx\n",i, - zpeekl(zatm_dev,uPD98401_IM(zatm_vcc->shaper)+16*i)); - qrp = (unsigned long *) zpeekl(zatm_dev,zatm_vcc->tx_chan*VC_SIZE/4+ - uPD98401_TXVC_QRP); - printk("qrp=0x%08lx\n",(unsigned long) qrp); - for (i = 0; i < 4; i++) printk("QRP[%d]: 0x%08lx",i,qrp[i]); -} -#endif - - -static const char *err_txt[] = { - "No error", - "RX buf underflow", - "RX FIFO overrun", - "Maximum len violation", - "CRC error", - "User abort", - "Length violation", - "T1 error", - "Deactivated", - "???", - "???", - "???", - "???", - "???", - "???", - "???" -}; - - -static void poll_rx(struct atm_dev *dev,int mbx) -{ - struct zatm_dev *zatm_dev; - unsigned long pos; - u32 x; - int error; - - EVENT("poll_rx\n",0,0); - zatm_dev = ZATM_DEV(dev); - pos = (zatm_dev->mbx_start[mbx] & ~0xffffUL) | zin(MTA(mbx)); - while (x = zin(MWA(mbx)), (pos & 0xffff) != x) { - u32 *here; - struct sk_buff *skb; - struct atm_vcc *vcc; - int cells,size,chan; - - EVENT("MBX: host 0x%lx, nic 0x%x\n",pos,x); - here = (u32 *) pos; - if (((pos += 16) & 0xffff) == zatm_dev->mbx_end[mbx]) - pos = zatm_dev->mbx_start[mbx]; - cells = here[0] & uPD98401_AAL5_SIZE; -#if 0 -printk("RX IND: 0x%x, 0x%x, 0x%x, 0x%x\n",here[0],here[1],here[2],here[3]); -{ -unsigned long *x; - printk("POOL: 0x%08x, 0x%08x\n",zpeekl(zatm_dev, - zatm_dev->pool_base), - zpeekl(zatm_dev,zatm_dev->pool_base+1)); - x = (unsigned long *) here[2]; - printk("[0..3] = 0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx\n", - x[0],x[1],x[2],x[3]); -} -#endif - error = 0; - if (here[3] & uPD98401_AAL5_ERR) { - error = (here[3] & uPD98401_AAL5_ES) >> - uPD98401_AAL5_ES_SHIFT; - if (error == uPD98401_AAL5_ES_DEACT || - error == uPD98401_AAL5_ES_FREE) continue; - } -EVENT("error code 0x%x/0x%x\n",(here[3] & uPD98401_AAL5_ES) >> - uPD98401_AAL5_ES_SHIFT,error); - skb = ((struct rx_buffer_head *) bus_to_virt(here[2]))->skb; - __net_timestamp(skb); -#if 0 -printk("[-3..0] 0x%08lx 0x%08lx 0x%08lx 0x%08lx\n",((unsigned *) skb->data)[-3], - ((unsigned *) skb->data)[-2],((unsigned *) skb->data)[-1], - ((unsigned *) skb->data)[0]); -#endif - EVENT("skb 0x%lx, here 0x%lx\n",(unsigned long) skb, - (unsigned long) here); -#if 0 -printk("dummy: 0x%08lx, 0x%08lx\n",dummy[0],dummy[1]); -#endif - size = error ? 
0 : ntohs(((__be16 *) skb->data)[cells* - ATM_CELL_PAYLOAD/sizeof(u16)-3]); - EVENT("got skb 0x%lx, size %d\n",(unsigned long) skb,size); - chan = (here[3] & uPD98401_AAL5_CHAN) >> - uPD98401_AAL5_CHAN_SHIFT; - if (chan < zatm_dev->chans && zatm_dev->rx_map[chan]) { - int pos; - vcc = zatm_dev->rx_map[chan]; - pos = ZATM_VCC(vcc)->pool; - if (skb == zatm_dev->last_free[pos]) - zatm_dev->last_free[pos] = NULL; - skb_unlink(skb, zatm_dev->pool + pos); - } - else { - printk(KERN_ERR DEV_LABEL "(itf %d): RX indication " - "for non-existing channel\n",dev->number); - size = 0; - vcc = NULL; - event_dump(); - } - if (error) { - static unsigned long silence = 0; - static int last_error = 0; - - if (error != last_error || - time_after(jiffies, silence) || silence == 0){ - printk(KERN_WARNING DEV_LABEL "(itf %d): " - "chan %d error %s\n",dev->number,chan, - err_txt[error]); - last_error = error; - silence = (jiffies+2*HZ)|1; - } - size = 0; - } - if (size && (size > cells*ATM_CELL_PAYLOAD-ATM_AAL5_TRAILER || - size <= (cells-1)*ATM_CELL_PAYLOAD-ATM_AAL5_TRAILER)) { - printk(KERN_ERR DEV_LABEL "(itf %d): size %d with %d " - "cells\n",dev->number,size,cells); - size = 0; - event_dump(); - } - if (size > ATM_MAX_AAL5_PDU) { - printk(KERN_ERR DEV_LABEL "(itf %d): size too big " - "(%d)\n",dev->number,size); - size = 0; - event_dump(); - } - if (!size) { - dev_kfree_skb_irq(skb); - if (vcc) atomic_inc(&vcc->stats->rx_err); - continue; - } - if (!atm_charge(vcc,skb->truesize)) { - dev_kfree_skb_irq(skb); - continue; - } - skb->len = size; - ATM_SKB(skb)->vcc = vcc; - vcc->push(vcc,skb); - atomic_inc(&vcc->stats->rx); - } - zout(pos & 0xffff,MTA(mbx)); -#if 0 /* probably a stupid idea */ - refill_pool(dev,zatm_vcc->pool); - /* maybe this saves us a few interrupts */ -#endif -} - - -static int open_rx_first(struct atm_vcc *vcc) -{ - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - unsigned long flags; - unsigned short chan; - int cells; - - DPRINTK("open_rx_first (0x%x)\n",inb_p(0xc053)); - zatm_dev = ZATM_DEV(vcc->dev); - zatm_vcc = ZATM_VCC(vcc); - zatm_vcc->rx_chan = 0; - if (vcc->qos.rxtp.traffic_class == ATM_NONE) return 0; - if (vcc->qos.aal == ATM_AAL5) { - if (vcc->qos.rxtp.max_sdu > 65464) - vcc->qos.rxtp.max_sdu = 65464; - /* fix this - we may want to receive 64kB SDUs - later */ - cells = DIV_ROUND_UP(vcc->qos.rxtp.max_sdu + ATM_AAL5_TRAILER, - ATM_CELL_PAYLOAD); - zatm_vcc->pool = pool_index(cells*ATM_CELL_PAYLOAD); - } - else { - cells = 1; - zatm_vcc->pool = ZATM_AAL0_POOL; - } - if (zatm_vcc->pool < 0) return -EMSGSIZE; - spin_lock_irqsave(&zatm_dev->lock, flags); - zwait(); - zout(uPD98401_OPEN_CHAN,CMR); - zwait(); - DPRINTK("0x%x 0x%x\n",zin(CMR),zin(CER)); - chan = (zin(CMR) & uPD98401_CHAN_ADDR) >> uPD98401_CHAN_ADDR_SHIFT; - spin_unlock_irqrestore(&zatm_dev->lock, flags); - DPRINTK("chan is %d\n",chan); - if (!chan) return -EAGAIN; - use_pool(vcc->dev,zatm_vcc->pool); - DPRINTK("pool %d\n",zatm_vcc->pool); - /* set up VC descriptor */ - spin_lock_irqsave(&zatm_dev->lock, flags); - zpokel(zatm_dev,zatm_vcc->pool << uPD98401_RXVC_POOL_SHIFT, - chan*VC_SIZE/4); - zpokel(zatm_dev,uPD98401_RXVC_OD | (vcc->qos.aal == ATM_AAL5 ? 
- uPD98401_RXVC_AR : 0) | cells,chan*VC_SIZE/4+1); - zpokel(zatm_dev,0,chan*VC_SIZE/4+2); - zatm_vcc->rx_chan = chan; - zatm_dev->rx_map[chan] = vcc; - spin_unlock_irqrestore(&zatm_dev->lock, flags); - return 0; -} - - -static int open_rx_second(struct atm_vcc *vcc) -{ - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - unsigned long flags; - int pos,shift; - - DPRINTK("open_rx_second (0x%x)\n",inb_p(0xc053)); - zatm_dev = ZATM_DEV(vcc->dev); - zatm_vcc = ZATM_VCC(vcc); - if (!zatm_vcc->rx_chan) return 0; - spin_lock_irqsave(&zatm_dev->lock, flags); - /* should also handle VPI @@@ */ - pos = vcc->vci >> 1; - shift = (1-(vcc->vci & 1)) << 4; - zpokel(zatm_dev,(zpeekl(zatm_dev,pos) & ~(0xffff << shift)) | - ((zatm_vcc->rx_chan | uPD98401_RXLT_ENBL) << shift),pos); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - return 0; -} - - -static void close_rx(struct atm_vcc *vcc) -{ - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - unsigned long flags; - int pos,shift; - - zatm_vcc = ZATM_VCC(vcc); - zatm_dev = ZATM_DEV(vcc->dev); - if (!zatm_vcc->rx_chan) return; - DPRINTK("close_rx\n"); - /* disable receiver */ - if (vcc->vpi != ATM_VPI_UNSPEC && vcc->vci != ATM_VCI_UNSPEC) { - spin_lock_irqsave(&zatm_dev->lock, flags); - pos = vcc->vci >> 1; - shift = (1-(vcc->vci & 1)) << 4; - zpokel(zatm_dev,zpeekl(zatm_dev,pos) & ~(0xffff << shift),pos); - zwait(); - zout(uPD98401_NOP,CMR); - zwait(); - zout(uPD98401_NOP,CMR); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - } - spin_lock_irqsave(&zatm_dev->lock, flags); - zwait(); - zout(uPD98401_DEACT_CHAN | uPD98401_CHAN_RT | (zatm_vcc->rx_chan << - uPD98401_CHAN_ADDR_SHIFT),CMR); - zwait(); - udelay(10); /* why oh why ... ? */ - zout(uPD98401_CLOSE_CHAN | uPD98401_CHAN_RT | (zatm_vcc->rx_chan << - uPD98401_CHAN_ADDR_SHIFT),CMR); - zwait(); - if (!(zin(CMR) & uPD98401_CHAN_ADDR)) - printk(KERN_CRIT DEV_LABEL "(itf %d): can't close RX channel " - "%d\n",vcc->dev->number,zatm_vcc->rx_chan); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - zatm_dev->rx_map[zatm_vcc->rx_chan] = NULL; - zatm_vcc->rx_chan = 0; - unuse_pool(vcc->dev,zatm_vcc->pool); -} - - -static int start_rx(struct atm_dev *dev) -{ - struct zatm_dev *zatm_dev; - int i; - - DPRINTK("start_rx\n"); - zatm_dev = ZATM_DEV(dev); - zatm_dev->rx_map = kcalloc(zatm_dev->chans, - sizeof(*zatm_dev->rx_map), - GFP_KERNEL); - if (!zatm_dev->rx_map) return -ENOMEM; - /* set VPI/VCI split (use all VCIs and give what's left to VPIs) */ - zpokel(zatm_dev,(1 << dev->ci_range.vci_bits)-1,uPD98401_VRR); - /* prepare free buffer pools */ - for (i = 0; i <= ZATM_LAST_POOL; i++) { - zatm_dev->pool_info[i].ref_count = 0; - zatm_dev->pool_info[i].rqa_count = 0; - zatm_dev->pool_info[i].rqu_count = 0; - zatm_dev->pool_info[i].low_water = LOW_MARK; - zatm_dev->pool_info[i].high_water = HIGH_MARK; - zatm_dev->pool_info[i].offset = 0; - zatm_dev->pool_info[i].next_off = 0; - zatm_dev->pool_info[i].next_cnt = 0; - zatm_dev->pool_info[i].next_thres = OFF_CNG_THRES; - } - return 0; -} - - -/*----------------------------------- TX ------------------------------------*/ - - -static int do_tx(struct sk_buff *skb) -{ - struct atm_vcc *vcc; - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - u32 *dsc; - unsigned long flags; - - EVENT("do_tx\n",0,0); - DPRINTK("sending skb %p\n",skb); - vcc = ATM_SKB(skb)->vcc; - zatm_dev = ZATM_DEV(vcc->dev); - zatm_vcc = ZATM_VCC(vcc); - EVENT("iovcnt=%d\n",skb_shinfo(skb)->nr_frags,0); - spin_lock_irqsave(&zatm_dev->lock, flags); - if (!skb_shinfo(skb)->nr_frags) 
{ - if (zatm_vcc->txing == RING_ENTRIES-1) { - spin_unlock_irqrestore(&zatm_dev->lock, flags); - return RING_BUSY; - } - zatm_vcc->txing++; - dsc = zatm_vcc->ring+zatm_vcc->ring_curr; - zatm_vcc->ring_curr = (zatm_vcc->ring_curr+RING_WORDS) & - (RING_ENTRIES*RING_WORDS-1); - dsc[1] = 0; - dsc[2] = skb->len; - dsc[3] = virt_to_bus(skb->data); - mb(); - dsc[0] = uPD98401_TXPD_V | uPD98401_TXPD_DP | uPD98401_TXPD_SM - | (vcc->qos.aal == ATM_AAL5 ? uPD98401_TXPD_AAL5 : 0 | - (ATM_SKB(skb)->atm_options & ATM_ATMOPT_CLP ? - uPD98401_CLPM_1 : uPD98401_CLPM_0)); - EVENT("dsc (0x%lx)\n",(unsigned long) dsc,0); - } - else { -printk("NONONONOO!!!!\n"); - dsc = NULL; -#if 0 - u32 *put; - int i; - - dsc = kmalloc(uPD98401_TXPD_SIZE * 2 + - uPD98401_TXBD_SIZE * ATM_SKB(skb)->iovcnt, GFP_ATOMIC); - if (!dsc) { - if (vcc->pop) - vcc->pop(vcc, skb); - else - dev_kfree_skb_irq(skb); - return -EAGAIN; - } - /* @@@ should check alignment */ - put = dsc+8; - dsc[0] = uPD98401_TXPD_V | uPD98401_TXPD_DP | - (vcc->aal == ATM_AAL5 ? uPD98401_TXPD_AAL5 : 0 | - (ATM_SKB(skb)->atm_options & ATM_ATMOPT_CLP ? - uPD98401_CLPM_1 : uPD98401_CLPM_0)); - dsc[1] = 0; - dsc[2] = ATM_SKB(skb)->iovcnt * uPD98401_TXBD_SIZE; - dsc[3] = virt_to_bus(put); - for (i = 0; i < ATM_SKB(skb)->iovcnt; i++) { - *put++ = ((struct iovec *) skb->data)[i].iov_len; - *put++ = virt_to_bus(((struct iovec *) - skb->data)[i].iov_base); - } - put[-2] |= uPD98401_TXBD_LAST; -#endif - } - ZATM_PRV_DSC(skb) = dsc; - skb_queue_tail(&zatm_vcc->tx_queue,skb); - DPRINTK("QRP=0x%08lx\n",zpeekl(zatm_dev,zatm_vcc->tx_chan*VC_SIZE/4+ - uPD98401_TXVC_QRP)); - zwait(); - zout(uPD98401_TX_READY | (zatm_vcc->tx_chan << - uPD98401_CHAN_ADDR_SHIFT),CMR); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - EVENT("done\n",0,0); - return 0; -} - - -static inline void dequeue_tx(struct atm_vcc *vcc) -{ - struct zatm_vcc *zatm_vcc; - struct sk_buff *skb; - - EVENT("dequeue_tx\n",0,0); - zatm_vcc = ZATM_VCC(vcc); - skb = skb_dequeue(&zatm_vcc->tx_queue); - if (!skb) { - printk(KERN_CRIT DEV_LABEL "(itf %d): dequeue_tx but not " - "txing\n",vcc->dev->number); - return; - } -#if 0 /* @@@ would fail on CLP */ -if (*ZATM_PRV_DSC(skb) != (uPD98401_TXPD_V | uPD98401_TXPD_DP | - uPD98401_TXPD_SM | uPD98401_TXPD_AAL5)) printk("@#*$!!!! (%08x)\n", - *ZATM_PRV_DSC(skb)); -#endif - *ZATM_PRV_DSC(skb) = 0; /* mark as invalid */ - zatm_vcc->txing--; - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb_irq(skb); - while ((skb = skb_dequeue(&zatm_vcc->backlog))) - if (do_tx(skb) == RING_BUSY) { - skb_queue_head(&zatm_vcc->backlog,skb); - break; - } - atomic_inc(&vcc->stats->tx); - wake_up(&zatm_vcc->tx_wait); -} - - -static void poll_tx(struct atm_dev *dev,int mbx) -{ - struct zatm_dev *zatm_dev; - unsigned long pos; - u32 x; - - EVENT("poll_tx\n",0,0); - zatm_dev = ZATM_DEV(dev); - pos = (zatm_dev->mbx_start[mbx] & ~0xffffUL) | zin(MTA(mbx)); - while (x = zin(MWA(mbx)), (pos & 0xffff) != x) { - int chan; - -#if 1 - u32 data,*addr; - - EVENT("MBX: host 0x%lx, nic 0x%x\n",pos,x); - addr = (u32 *) pos; - data = *addr; - chan = (data & uPD98401_TXI_CONN) >> uPD98401_TXI_CONN_SHIFT; - EVENT("addr = 0x%lx, data = 0x%08x,",(unsigned long) addr, - data); - EVENT("chan = %d\n",chan,0); -#else -NO ! 
- chan = (zatm_dev->mbx_start[mbx][pos >> 2] & uPD98401_TXI_CONN) - >> uPD98401_TXI_CONN_SHIFT; -#endif - if (chan < zatm_dev->chans && zatm_dev->tx_map[chan]) - dequeue_tx(zatm_dev->tx_map[chan]); - else { - printk(KERN_CRIT DEV_LABEL "(itf %d): TX indication " - "for non-existing channel %d\n",dev->number,chan); - event_dump(); - } - if (((pos += 4) & 0xffff) == zatm_dev->mbx_end[mbx]) - pos = zatm_dev->mbx_start[mbx]; - } - zout(pos & 0xffff,MTA(mbx)); -} - - -/* - * BUG BUG BUG: Doesn't handle "new-style" rate specification yet. - */ - -static int alloc_shaper(struct atm_dev *dev,int *pcr,int min,int max,int ubr) -{ - struct zatm_dev *zatm_dev; - unsigned long flags; - unsigned long i,m,c; - int shaper; - - DPRINTK("alloc_shaper (min = %d, max = %d)\n",min,max); - zatm_dev = ZATM_DEV(dev); - if (!zatm_dev->free_shapers) return -EAGAIN; - for (shaper = 0; !((zatm_dev->free_shapers >> shaper) & 1); shaper++); - zatm_dev->free_shapers &= ~1 << shaper; - if (ubr) { - c = 5; - i = m = 1; - zatm_dev->ubr_ref_cnt++; - zatm_dev->ubr = shaper; - *pcr = 0; - } - else { - if (min) { - if (min <= 255) { - i = min; - m = ATM_OC3_PCR; - } - else { - i = 255; - m = ATM_OC3_PCR*255/min; - } - } - else { - if (max > zatm_dev->tx_bw) max = zatm_dev->tx_bw; - if (max <= 255) { - i = max; - m = ATM_OC3_PCR; - } - else { - i = 255; - m = DIV_ROUND_UP(ATM_OC3_PCR*255, max); - } - } - if (i > m) { - printk(KERN_CRIT DEV_LABEL "shaper algorithm botched " - "[%d,%d] -> i=%ld,m=%ld\n",min,max,i,m); - m = i; - } - *pcr = i*ATM_OC3_PCR/m; - c = 20; /* @@@ should use max_cdv ! */ - if ((min && *pcr < min) || (max && *pcr > max)) return -EINVAL; - if (zatm_dev->tx_bw < *pcr) return -EAGAIN; - zatm_dev->tx_bw -= *pcr; - } - spin_lock_irqsave(&zatm_dev->lock, flags); - DPRINTK("i = %d, m = %d, PCR = %d\n",i,m,*pcr); - zpokel(zatm_dev,(i << uPD98401_IM_I_SHIFT) | m,uPD98401_IM(shaper)); - zpokel(zatm_dev,c << uPD98401_PC_C_SHIFT,uPD98401_PC(shaper)); - zpokel(zatm_dev,0,uPD98401_X(shaper)); - zpokel(zatm_dev,0,uPD98401_Y(shaper)); - zpokel(zatm_dev,uPD98401_PS_E,uPD98401_PS(shaper)); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - return shaper; -} - - -static void dealloc_shaper(struct atm_dev *dev,int shaper) -{ - struct zatm_dev *zatm_dev; - unsigned long flags; - - zatm_dev = ZATM_DEV(dev); - if (shaper == zatm_dev->ubr) { - if (--zatm_dev->ubr_ref_cnt) return; - zatm_dev->ubr = -1; - } - spin_lock_irqsave(&zatm_dev->lock, flags); - zpokel(zatm_dev,zpeekl(zatm_dev,uPD98401_PS(shaper)) & ~uPD98401_PS_E, - uPD98401_PS(shaper)); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - zatm_dev->free_shapers |= 1 << shaper; -} - - -static void close_tx(struct atm_vcc *vcc) -{ - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - unsigned long flags; - int chan; - - zatm_vcc = ZATM_VCC(vcc); - zatm_dev = ZATM_DEV(vcc->dev); - chan = zatm_vcc->tx_chan; - if (!chan) return; - DPRINTK("close_tx\n"); - if (skb_peek(&zatm_vcc->backlog)) { - printk("waiting for backlog to drain ...\n"); - event_dump(); - wait_event(zatm_vcc->tx_wait, !skb_peek(&zatm_vcc->backlog)); - } - if (skb_peek(&zatm_vcc->tx_queue)) { - printk("waiting for TX queue to drain ...\n"); - event_dump(); - wait_event(zatm_vcc->tx_wait, !skb_peek(&zatm_vcc->tx_queue)); - } - spin_lock_irqsave(&zatm_dev->lock, flags); -#if 0 - zwait(); - zout(uPD98401_DEACT_CHAN | (chan << uPD98401_CHAN_ADDR_SHIFT),CMR); -#endif - zwait(); - zout(uPD98401_CLOSE_CHAN | (chan << uPD98401_CHAN_ADDR_SHIFT),CMR); - zwait(); - if (!(zin(CMR) & uPD98401_CHAN_ADDR)) - 
printk(KERN_CRIT DEV_LABEL "(itf %d): can't close TX channel " - "%d\n",vcc->dev->number,chan); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - zatm_vcc->tx_chan = 0; - zatm_dev->tx_map[chan] = NULL; - if (zatm_vcc->shaper != zatm_dev->ubr) { - zatm_dev->tx_bw += vcc->qos.txtp.min_pcr; - dealloc_shaper(vcc->dev,zatm_vcc->shaper); - } - kfree(zatm_vcc->ring); -} - - -static int open_tx_first(struct atm_vcc *vcc) -{ - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - unsigned long flags; - u32 *loop; - unsigned short chan; - int unlimited; - - DPRINTK("open_tx_first\n"); - zatm_dev = ZATM_DEV(vcc->dev); - zatm_vcc = ZATM_VCC(vcc); - zatm_vcc->tx_chan = 0; - if (vcc->qos.txtp.traffic_class == ATM_NONE) return 0; - spin_lock_irqsave(&zatm_dev->lock, flags); - zwait(); - zout(uPD98401_OPEN_CHAN,CMR); - zwait(); - DPRINTK("0x%x 0x%x\n",zin(CMR),zin(CER)); - chan = (zin(CMR) & uPD98401_CHAN_ADDR) >> uPD98401_CHAN_ADDR_SHIFT; - spin_unlock_irqrestore(&zatm_dev->lock, flags); - DPRINTK("chan is %d\n",chan); - if (!chan) return -EAGAIN; - unlimited = vcc->qos.txtp.traffic_class == ATM_UBR && - (!vcc->qos.txtp.max_pcr || vcc->qos.txtp.max_pcr == ATM_MAX_PCR || - vcc->qos.txtp.max_pcr >= ATM_OC3_PCR); - if (unlimited && zatm_dev->ubr != -1) zatm_vcc->shaper = zatm_dev->ubr; - else { - int pcr; - - if (unlimited) vcc->qos.txtp.max_sdu = ATM_MAX_AAL5_PDU; - if ((zatm_vcc->shaper = alloc_shaper(vcc->dev,&pcr, - vcc->qos.txtp.min_pcr,vcc->qos.txtp.max_pcr,unlimited)) - < 0) { - close_tx(vcc); - return zatm_vcc->shaper; - } - if (pcr > ATM_OC3_PCR) pcr = ATM_OC3_PCR; - vcc->qos.txtp.min_pcr = vcc->qos.txtp.max_pcr = pcr; - } - zatm_vcc->tx_chan = chan; - skb_queue_head_init(&zatm_vcc->tx_queue); - init_waitqueue_head(&zatm_vcc->tx_wait); - /* initialize ring */ - zatm_vcc->ring = kzalloc(RING_SIZE,GFP_KERNEL); - if (!zatm_vcc->ring) return -ENOMEM; - loop = zatm_vcc->ring+RING_ENTRIES*RING_WORDS; - loop[0] = uPD98401_TXPD_V; - loop[1] = loop[2] = 0; - loop[3] = virt_to_bus(zatm_vcc->ring); - zatm_vcc->ring_curr = 0; - zatm_vcc->txing = 0; - skb_queue_head_init(&zatm_vcc->backlog); - zpokel(zatm_dev,virt_to_bus(zatm_vcc->ring), - chan*VC_SIZE/4+uPD98401_TXVC_QRP); - return 0; -} - - -static int open_tx_second(struct atm_vcc *vcc) -{ - struct zatm_dev *zatm_dev; - struct zatm_vcc *zatm_vcc; - unsigned long flags; - - DPRINTK("open_tx_second\n"); - zatm_dev = ZATM_DEV(vcc->dev); - zatm_vcc = ZATM_VCC(vcc); - if (!zatm_vcc->tx_chan) return 0; - /* set up VC descriptor */ - spin_lock_irqsave(&zatm_dev->lock, flags); - zpokel(zatm_dev,0,zatm_vcc->tx_chan*VC_SIZE/4); - zpokel(zatm_dev,uPD98401_TXVC_L | (zatm_vcc->shaper << - uPD98401_TXVC_SHP_SHIFT) | (vcc->vpi << uPD98401_TXVC_VPI_SHIFT) | - vcc->vci,zatm_vcc->tx_chan*VC_SIZE/4+1); - zpokel(zatm_dev,0,zatm_vcc->tx_chan*VC_SIZE/4+2); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - zatm_dev->tx_map[zatm_vcc->tx_chan] = vcc; - return 0; -} - - -static int start_tx(struct atm_dev *dev) -{ - struct zatm_dev *zatm_dev; - int i; - - DPRINTK("start_tx\n"); - zatm_dev = ZATM_DEV(dev); - zatm_dev->tx_map = kmalloc_array(zatm_dev->chans, - sizeof(*zatm_dev->tx_map), - GFP_KERNEL); - if (!zatm_dev->tx_map) return -ENOMEM; - zatm_dev->tx_bw = ATM_OC3_PCR; - zatm_dev->free_shapers = (1 << NR_SHAPERS)-1; - zatm_dev->ubr = -1; - zatm_dev->ubr_ref_cnt = 0; - /* initialize shapers */ - for (i = 0; i < NR_SHAPERS; i++) zpokel(zatm_dev,0,uPD98401_PS(i)); - return 0; -} - - -/*------------------------------- interrupts --------------------------------*/ - - -static 
irqreturn_t zatm_int(int irq,void *dev_id) -{ - struct atm_dev *dev; - struct zatm_dev *zatm_dev; - u32 reason; - int handled = 0; - - dev = dev_id; - zatm_dev = ZATM_DEV(dev); - while ((reason = zin(GSR))) { - handled = 1; - EVENT("reason 0x%x\n",reason,0); - if (reason & uPD98401_INT_PI) { - EVENT("PHY int\n",0,0); - dev->phy->interrupt(dev); - } - if (reason & uPD98401_INT_RQA) { - unsigned long pools; - int i; - - pools = zin(RQA); - EVENT("RQA (0x%08x)\n",pools,0); - for (i = 0; pools; i++) { - if (pools & 1) { - refill_pool(dev,i); - zatm_dev->pool_info[i].rqa_count++; - } - pools >>= 1; - } - } - if (reason & uPD98401_INT_RQU) { - unsigned long pools; - int i; - pools = zin(RQU); - printk(KERN_WARNING DEV_LABEL "(itf %d): RQU 0x%08lx\n", - dev->number,pools); - event_dump(); - for (i = 0; pools; i++) { - if (pools & 1) { - refill_pool(dev,i); - zatm_dev->pool_info[i].rqu_count++; - } - pools >>= 1; - } - } - /* don't handle RD */ - if (reason & uPD98401_INT_SPE) - printk(KERN_ALERT DEV_LABEL "(itf %d): system parity " - "error at 0x%08x\n",dev->number,zin(ADDR)); - if (reason & uPD98401_INT_CPE) - printk(KERN_ALERT DEV_LABEL "(itf %d): control memory " - "parity error at 0x%08x\n",dev->number,zin(ADDR)); - if (reason & uPD98401_INT_SBE) { - printk(KERN_ALERT DEV_LABEL "(itf %d): system bus " - "error at 0x%08x\n",dev->number,zin(ADDR)); - event_dump(); - } - /* don't handle IND */ - if (reason & uPD98401_INT_MF) { - printk(KERN_CRIT DEV_LABEL "(itf %d): mailbox full " - "(0x%x)\n",dev->number,(reason & uPD98401_INT_MF) - >> uPD98401_INT_MF_SHIFT); - event_dump(); - /* @@@ should try to recover */ - } - if (reason & uPD98401_INT_MM) { - if (reason & 1) poll_rx(dev,0); - if (reason & 2) poll_rx(dev,1); - if (reason & 4) poll_tx(dev,2); - if (reason & 8) poll_tx(dev,3); - } - /* @@@ handle RCRn */ - } - return IRQ_RETVAL(handled); -} - - -/*----------------------------- (E)EPROM access -----------------------------*/ - - -static void eprom_set(struct zatm_dev *zatm_dev, unsigned long value, - unsigned short cmd) -{ - int error; - - if ((error = pci_write_config_dword(zatm_dev->pci_dev,cmd,value))) - printk(KERN_ERR DEV_LABEL ": PCI write failed (0x%02x)\n", - error); -} - - -static unsigned long eprom_get(struct zatm_dev *zatm_dev, unsigned short cmd) -{ - unsigned int value; - int error; - - if ((error = pci_read_config_dword(zatm_dev->pci_dev,cmd,&value))) - printk(KERN_ERR DEV_LABEL ": PCI read failed (0x%02x)\n", - error); - return value; -} - - -static void eprom_put_bits(struct zatm_dev *zatm_dev, unsigned long data, - int bits, unsigned short cmd) -{ - unsigned long value; - int i; - - for (i = bits-1; i >= 0; i--) { - value = ZEPROM_CS | (((data >> i) & 1) ? 
ZEPROM_DI : 0); - eprom_set(zatm_dev,value,cmd); - eprom_set(zatm_dev,value | ZEPROM_SK,cmd); - eprom_set(zatm_dev,value,cmd); - } -} - - -static void eprom_get_byte(struct zatm_dev *zatm_dev, unsigned char *byte, - unsigned short cmd) -{ - int i; - - *byte = 0; - for (i = 8; i; i--) { - eprom_set(zatm_dev,ZEPROM_CS,cmd); - eprom_set(zatm_dev,ZEPROM_CS | ZEPROM_SK,cmd); - *byte <<= 1; - if (eprom_get(zatm_dev,cmd) & ZEPROM_DO) *byte |= 1; - eprom_set(zatm_dev,ZEPROM_CS,cmd); - } -} - - -static int eprom_try_esi(struct atm_dev *dev, unsigned short cmd, int offset, - int swap) -{ - unsigned char buf[ZEPROM_SIZE]; - struct zatm_dev *zatm_dev; - int i; - - zatm_dev = ZATM_DEV(dev); - for (i = 0; i < ZEPROM_SIZE; i += 2) { - eprom_set(zatm_dev,ZEPROM_CS,cmd); /* select EPROM */ - eprom_put_bits(zatm_dev,ZEPROM_CMD_READ,ZEPROM_CMD_LEN,cmd); - eprom_put_bits(zatm_dev,i >> 1,ZEPROM_ADDR_LEN,cmd); - eprom_get_byte(zatm_dev,buf+i+swap,cmd); - eprom_get_byte(zatm_dev,buf+i+1-swap,cmd); - eprom_set(zatm_dev,0,cmd); /* deselect EPROM */ - } - memcpy(dev->esi,buf+offset,ESI_LEN); - return memcmp(dev->esi,"\0\0\0\0\0",ESI_LEN); /* assumes ESI_LEN == 6 */ -} - - -static void eprom_get_esi(struct atm_dev *dev) -{ - if (eprom_try_esi(dev,ZEPROM_V1_REG,ZEPROM_V1_ESI_OFF,1)) return; - (void) eprom_try_esi(dev,ZEPROM_V2_REG,ZEPROM_V2_ESI_OFF,0); -} - - -/*--------------------------------- entries ---------------------------------*/ - - -static int zatm_init(struct atm_dev *dev) -{ - struct zatm_dev *zatm_dev; - struct pci_dev *pci_dev; - unsigned short command; - int error,i,last; - unsigned long t0,t1,t2; - - DPRINTK(">zatm_init\n"); - zatm_dev = ZATM_DEV(dev); - spin_lock_init(&zatm_dev->lock); - pci_dev = zatm_dev->pci_dev; - zatm_dev->base = pci_resource_start(pci_dev, 0); - zatm_dev->irq = pci_dev->irq; - if ((error = pci_read_config_word(pci_dev,PCI_COMMAND,&command))) { - printk(KERN_ERR DEV_LABEL "(itf %d): init error 0x%02x\n", - dev->number,error); - return -EINVAL; - } - if ((error = pci_write_config_word(pci_dev,PCI_COMMAND, - command | PCI_COMMAND_IO | PCI_COMMAND_MASTER))) { - printk(KERN_ERR DEV_LABEL "(itf %d): can't enable IO (0x%02x)" - "\n",dev->number,error); - return -EIO; - } - eprom_get_esi(dev); - printk(KERN_NOTICE DEV_LABEL "(itf %d): rev.%d,base=0x%x,irq=%d,", - dev->number,pci_dev->revision,zatm_dev->base,zatm_dev->irq); - /* reset uPD98401 */ - zout(0,SWR); - while (!(zin(GSR) & uPD98401_INT_IND)); - zout(uPD98401_GMR_ONE /*uPD98401_BURST4*/,GMR); - last = MAX_CRAM_SIZE; - for (i = last-RAM_INCREMENT; i >= 0; i -= RAM_INCREMENT) { - zpokel(zatm_dev,0x55555555,i); - if (zpeekl(zatm_dev,i) != 0x55555555) last = i; - else { - zpokel(zatm_dev,0xAAAAAAAA,i); - if (zpeekl(zatm_dev,i) != 0xAAAAAAAA) last = i; - else zpokel(zatm_dev,i,i); - } - } - for (i = 0; i < last; i += RAM_INCREMENT) - if (zpeekl(zatm_dev,i) != i) break; - zatm_dev->mem = i << 2; - while (i) zpokel(zatm_dev,0,--i); - /* reset again to rebuild memory pointers */ - zout(0,SWR); - while (!(zin(GSR) & uPD98401_INT_IND)); - zout(uPD98401_GMR_ONE | uPD98401_BURST8 | uPD98401_BURST4 | - uPD98401_BURST2 | uPD98401_GMR_PM | uPD98401_GMR_DR,GMR); - /* TODO: should shrink allocation now */ - printk("mem=%dkB,%s (",zatm_dev->mem >> 10,zatm_dev->copper ? "UTP" : - "MMF"); - for (i = 0; i < ESI_LEN; i++) - printk("%02X%s",dev->esi[i],i == ESI_LEN-1 ? 
")\n" : "-"); - do { - unsigned long flags; - - spin_lock_irqsave(&zatm_dev->lock, flags); - t0 = zpeekl(zatm_dev,uPD98401_TSR); - udelay(10); - t1 = zpeekl(zatm_dev,uPD98401_TSR); - udelay(1010); - t2 = zpeekl(zatm_dev,uPD98401_TSR); - spin_unlock_irqrestore(&zatm_dev->lock, flags); - } - while (t0 > t1 || t1 > t2); /* loop if wrapping ... */ - zatm_dev->khz = t2-2*t1+t0; - printk(KERN_NOTICE DEV_LABEL "(itf %d): uPD98401 %d.%d at %d.%03d " - "MHz\n",dev->number, - (zin(VER) & uPD98401_MAJOR) >> uPD98401_MAJOR_SHIFT, - zin(VER) & uPD98401_MINOR,zatm_dev->khz/1000,zatm_dev->khz % 1000); - return uPD98402_init(dev); -} - - -static int zatm_start(struct atm_dev *dev) -{ - struct zatm_dev *zatm_dev = ZATM_DEV(dev); - struct pci_dev *pdev = zatm_dev->pci_dev; - unsigned long curr; - int pools,vccs,rx; - int error, i, ld; - - DPRINTK("zatm_start\n"); - zatm_dev->rx_map = zatm_dev->tx_map = NULL; - for (i = 0; i < NR_MBX; i++) - zatm_dev->mbx_start[i] = 0; - error = request_irq(zatm_dev->irq, zatm_int, IRQF_SHARED, DEV_LABEL, dev); - if (error < 0) { - printk(KERN_ERR DEV_LABEL "(itf %d): IRQ%d is already in use\n", - dev->number,zatm_dev->irq); - goto done; - } - /* define memory regions */ - pools = NR_POOLS; - if (NR_SHAPERS*SHAPER_SIZE > pools*POOL_SIZE) - pools = NR_SHAPERS*SHAPER_SIZE/POOL_SIZE; - vccs = (zatm_dev->mem-NR_SHAPERS*SHAPER_SIZE-pools*POOL_SIZE)/ - (2*VC_SIZE+RX_SIZE); - ld = -1; - for (rx = 1; rx < vccs; rx <<= 1) ld++; - dev->ci_range.vpi_bits = 0; /* @@@ no VPI for now */ - dev->ci_range.vci_bits = ld; - dev->link_rate = ATM_OC3_PCR; - zatm_dev->chans = vccs; /* ??? */ - curr = rx*RX_SIZE/4; - DPRINTK("RX pool 0x%08lx\n",curr); - zpokel(zatm_dev,curr,uPD98401_PMA); /* receive pool */ - zatm_dev->pool_base = curr; - curr += pools*POOL_SIZE/4; - DPRINTK("Shapers 0x%08lx\n",curr); - zpokel(zatm_dev,curr,uPD98401_SMA); /* shapers */ - curr += NR_SHAPERS*SHAPER_SIZE/4; - DPRINTK("Free 0x%08lx\n",curr); - zpokel(zatm_dev,curr,uPD98401_TOS); /* free pool */ - printk(KERN_INFO DEV_LABEL "(itf %d): %d shapers, %d pools, %d RX, " - "%ld VCs\n",dev->number,NR_SHAPERS,pools,rx, - (zatm_dev->mem-curr*4)/VC_SIZE); - /* create mailboxes */ - for (i = 0; i < NR_MBX; i++) { - void *mbx; - dma_addr_t mbx_dma; - - if (!mbx_entries[i]) - continue; - mbx = dma_alloc_coherent(&pdev->dev, - 2 * MBX_SIZE(i), &mbx_dma, GFP_KERNEL); - if (!mbx) { - error = -ENOMEM; - goto out; - } - /* - * Alignment provided by dma_alloc_coherent() isn't enough - * for this device. 
- */ - if (((unsigned long)mbx ^ mbx_dma) & 0xffff) { - printk(KERN_ERR DEV_LABEL "(itf %d): system " - "bus incompatible with driver\n", dev->number); - dma_free_coherent(&pdev->dev, 2*MBX_SIZE(i), mbx, mbx_dma); - error = -ENODEV; - goto out; - } - DPRINTK("mbx@0x%08lx-0x%08lx\n", mbx, mbx + MBX_SIZE(i)); - zatm_dev->mbx_start[i] = (unsigned long)mbx; - zatm_dev->mbx_dma[i] = mbx_dma; - zatm_dev->mbx_end[i] = (zatm_dev->mbx_start[i] + MBX_SIZE(i)) & - 0xffff; - zout(mbx_dma >> 16, MSH(i)); - zout(mbx_dma, MSL(i)); - zout(zatm_dev->mbx_end[i], MBA(i)); - zout((unsigned long)mbx & 0xffff, MTA(i)); - zout((unsigned long)mbx & 0xffff, MWA(i)); - } - error = start_tx(dev); - if (error) - goto out; - error = start_rx(dev); - if (error) - goto out_tx; - error = dev->phy->start(dev); - if (error) - goto out_rx; - zout(0xffffffff,IMR); /* enable interrupts */ - /* enable TX & RX */ - zout(zin(GMR) | uPD98401_GMR_SE | uPD98401_GMR_RE,GMR); -done: - return error; - -out_rx: - kfree(zatm_dev->rx_map); -out_tx: - kfree(zatm_dev->tx_map); -out: - while (i-- > 0) { - dma_free_coherent(&pdev->dev, 2 * MBX_SIZE(i), - (void *)zatm_dev->mbx_start[i], - zatm_dev->mbx_dma[i]); - } - free_irq(zatm_dev->irq, dev); - goto done; -} - - -static void zatm_close(struct atm_vcc *vcc) -{ - DPRINTK(">zatm_close\n"); - if (!ZATM_VCC(vcc)) return; - clear_bit(ATM_VF_READY,&vcc->flags); - close_rx(vcc); - EVENT("close_tx\n",0,0); - close_tx(vcc); - DPRINTK("zatm_close: done waiting\n"); - /* deallocate memory */ - kfree(ZATM_VCC(vcc)); - vcc->dev_data = NULL; - clear_bit(ATM_VF_ADDR,&vcc->flags); -} - - -static int zatm_open(struct atm_vcc *vcc) -{ - struct zatm_vcc *zatm_vcc; - short vpi = vcc->vpi; - int vci = vcc->vci; - int error; - - DPRINTK(">zatm_open\n"); - if (!test_bit(ATM_VF_PARTIAL,&vcc->flags)) - vcc->dev_data = NULL; - if (vci != ATM_VPI_UNSPEC && vpi != ATM_VCI_UNSPEC) - set_bit(ATM_VF_ADDR,&vcc->flags); - if (vcc->qos.aal != ATM_AAL5) return -EINVAL; /* @@@ AAL0 */ - DPRINTK(DEV_LABEL "(itf %d): open %d.%d\n",vcc->dev->number,vcc->vpi, - vcc->vci); - if (!test_bit(ATM_VF_PARTIAL,&vcc->flags)) { - zatm_vcc = kmalloc(sizeof(*zatm_vcc), GFP_KERNEL); - if (!zatm_vcc) { - clear_bit(ATM_VF_ADDR,&vcc->flags); - return -ENOMEM; - } - vcc->dev_data = zatm_vcc; - ZATM_VCC(vcc)->tx_chan = 0; /* for zatm_close after open_rx */ - if ((error = open_rx_first(vcc))) { - zatm_close(vcc); - return error; - } - if ((error = open_tx_first(vcc))) { - zatm_close(vcc); - return error; - } - } - if (vci == ATM_VPI_UNSPEC || vpi == ATM_VCI_UNSPEC) return 0; - if ((error = open_rx_second(vcc))) { - zatm_close(vcc); - return error; - } - if ((error = open_tx_second(vcc))) { - zatm_close(vcc); - return error; - } - set_bit(ATM_VF_READY,&vcc->flags); - return 0; -} - - -static int zatm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos,int flags) -{ - printk("Not yet implemented\n"); - return -ENOSYS; - /* @@@ */ -} - - -static int zatm_ioctl(struct atm_dev *dev,unsigned int cmd,void __user *arg) -{ - struct zatm_dev *zatm_dev; - unsigned long flags; - - zatm_dev = ZATM_DEV(dev); - switch (cmd) { - case ZATM_GETPOOLZ: - if (!capable(CAP_NET_ADMIN)) return -EPERM; - fallthrough; - case ZATM_GETPOOL: - { - struct zatm_pool_info info; - int pool; - - if (get_user(pool, - &((struct zatm_pool_req __user *) arg)->pool_num)) - return -EFAULT; - if (pool < 0 || pool > ZATM_LAST_POOL) - return -EINVAL; - pool = array_index_nospec(pool, - ZATM_LAST_POOL + 1); - spin_lock_irqsave(&zatm_dev->lock, flags); - info = zatm_dev->pool_info[pool]; - if 
(cmd == ZATM_GETPOOLZ) { - zatm_dev->pool_info[pool].rqa_count = 0; - zatm_dev->pool_info[pool].rqu_count = 0; - } - spin_unlock_irqrestore(&zatm_dev->lock, flags); - return copy_to_user( - &((struct zatm_pool_req __user *) arg)->info, - &info,sizeof(info)) ? -EFAULT : 0; - } - case ZATM_SETPOOL: - { - struct zatm_pool_info info; - int pool; - - if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (get_user(pool, - &((struct zatm_pool_req __user *) arg)->pool_num)) - return -EFAULT; - if (pool < 0 || pool > ZATM_LAST_POOL) - return -EINVAL; - pool = array_index_nospec(pool, - ZATM_LAST_POOL + 1); - if (copy_from_user(&info, - &((struct zatm_pool_req __user *) arg)->info, - sizeof(info))) return -EFAULT; - if (!info.low_water) - info.low_water = zatm_dev-> - pool_info[pool].low_water; - if (!info.high_water) - info.high_water = zatm_dev-> - pool_info[pool].high_water; - if (!info.next_thres) - info.next_thres = zatm_dev-> - pool_info[pool].next_thres; - if (info.low_water >= info.high_water || - info.low_water < 0) - return -EINVAL; - spin_lock_irqsave(&zatm_dev->lock, flags); - zatm_dev->pool_info[pool].low_water = - info.low_water; - zatm_dev->pool_info[pool].high_water = - info.high_water; - zatm_dev->pool_info[pool].next_thres = - info.next_thres; - spin_unlock_irqrestore(&zatm_dev->lock, flags); - return 0; - } - default: - if (!dev->phy->ioctl) return -ENOIOCTLCMD; - return dev->phy->ioctl(dev,cmd,arg); - } -} - -static int zatm_send(struct atm_vcc *vcc,struct sk_buff *skb) -{ - int error; - - EVENT(">zatm_send 0x%lx\n",(unsigned long) skb,0); - if (!ZATM_VCC(vcc)->tx_chan || !test_bit(ATM_VF_READY,&vcc->flags)) { - if (vcc->pop) vcc->pop(vcc,skb); - else dev_kfree_skb(skb); - return -EINVAL; - } - if (!skb) { - printk(KERN_CRIT "!skb in zatm_send ?\n"); - if (vcc->pop) vcc->pop(vcc,skb); - return -EINVAL; - } - ATM_SKB(skb)->vcc = vcc; - error = do_tx(skb); - if (error != RING_BUSY) return error; - skb_queue_tail(&ZATM_VCC(vcc)->backlog,skb); - return 0; -} - - -static void zatm_phy_put(struct atm_dev *dev,unsigned char value, - unsigned long addr) -{ - struct zatm_dev *zatm_dev; - - zatm_dev = ZATM_DEV(dev); - zwait(); - zout(value,CER); - zout(uPD98401_IND_ACC | uPD98401_IA_B0 | - (uPD98401_IA_TGT_PHY << uPD98401_IA_TGT_SHIFT) | addr,CMR); -} - - -static unsigned char zatm_phy_get(struct atm_dev *dev,unsigned long addr) -{ - struct zatm_dev *zatm_dev; - - zatm_dev = ZATM_DEV(dev); - zwait(); - zout(uPD98401_IND_ACC | uPD98401_IA_B0 | uPD98401_IA_RW | - (uPD98401_IA_TGT_PHY << uPD98401_IA_TGT_SHIFT) | addr,CMR); - zwait(); - return zin(CER) & 0xff; -} - - -static const struct atmdev_ops ops = { - .open = zatm_open, - .close = zatm_close, - .ioctl = zatm_ioctl, - .send = zatm_send, - .phy_put = zatm_phy_put, - .phy_get = zatm_phy_get, - .change_qos = zatm_change_qos, -}; - -static int zatm_init_one(struct pci_dev *pci_dev, - const struct pci_device_id *ent) -{ - struct atm_dev *dev; - struct zatm_dev *zatm_dev; - int ret = -ENOMEM; - - zatm_dev = kmalloc(sizeof(*zatm_dev), GFP_KERNEL); - if (!zatm_dev) { - printk(KERN_EMERG "%s: memory shortage\n", DEV_LABEL); - goto out; - } - - dev = atm_dev_register(DEV_LABEL, &pci_dev->dev, &ops, -1, NULL); - if (!dev) - goto out_free; - - ret = pci_enable_device(pci_dev); - if (ret < 0) - goto out_deregister; - - ret = pci_request_regions(pci_dev, DEV_LABEL); - if (ret < 0) - goto out_disable; - - ret = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32)); - if (ret < 0) - goto out_release; - - zatm_dev->pci_dev = pci_dev; - dev->dev_data = 
zatm_dev; - zatm_dev->copper = (int)ent->driver_data; - if ((ret = zatm_init(dev)) || (ret = zatm_start(dev))) - goto out_release; - - pci_set_drvdata(pci_dev, dev); - zatm_dev->more = zatm_boards; - zatm_boards = dev; - ret = 0; -out: - return ret; - -out_release: - pci_release_regions(pci_dev); -out_disable: - pci_disable_device(pci_dev); -out_deregister: - atm_dev_deregister(dev); -out_free: - kfree(zatm_dev); - goto out; -} - - -MODULE_LICENSE("GPL"); - -static const struct pci_device_id zatm_pci_tbl[] = { - { PCI_VDEVICE(ZEITNET, PCI_DEVICE_ID_ZEITNET_1221), ZATM_COPPER }, - { PCI_VDEVICE(ZEITNET, PCI_DEVICE_ID_ZEITNET_1225), 0 }, - { 0, } -}; -MODULE_DEVICE_TABLE(pci, zatm_pci_tbl); - -static struct pci_driver zatm_driver = { - .name = DEV_LABEL, - .id_table = zatm_pci_tbl, - .probe = zatm_init_one, -}; - -static int __init zatm_init_module(void) -{ - return pci_register_driver(&zatm_driver); -} - -module_init(zatm_init_module); -/* module_exit not defined so not unloadable */ diff --git a/drivers/atm/zatm.h b/drivers/atm/zatm.h deleted file mode 100644 index 8204369fe825..000000000000 --- a/drivers/atm/zatm.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* drivers/atm/zatm.h - ZeitNet ZN122x device driver declarations */ - -/* Written 1995-1998 by Werner Almesberger, EPFL LRC/ICA */ - - -#ifndef DRIVER_ATM_ZATM_H -#define DRIVER_ATM_ZATM_H - -#include -#include -#include -#include -#include - - -#define DEV_LABEL "zatm" - -#define MAX_AAL5_PDU 10240 /* allocate for AAL5 PDUs of this size */ -#define MAX_RX_SIZE_LD 14 /* ceil(log2((MAX_AAL5_PDU+47)/48)) */ - -#define LOW_MARK 12 /* start adding new buffers if less than 12 */ -#define HIGH_MARK 30 /* stop adding buffers after reaching 30 */ -#define OFF_CNG_THRES 5 /* threshold for offset changes */ - -#define RX_SIZE 2 /* RX lookup entry size (in bytes) */ -#define NR_POOLS 32 /* number of free buffer pointers */ -#define POOL_SIZE 8 /* buffer entry size (in bytes) */ -#define NR_SHAPERS 16 /* number of shapers */ -#define SHAPER_SIZE 4 /* shaper entry size (in bytes) */ -#define VC_SIZE 32 /* VC dsc (TX or RX) size (in bytes) */ - -#define RING_ENTRIES 32 /* ring entries (without back pointer) */ -#define RING_WORDS 4 /* ring element size */ -#define RING_SIZE (sizeof(unsigned long)*(RING_ENTRIES+1)*RING_WORDS) - -#define NR_MBX 4 /* four mailboxes */ -#define MBX_RX_0 0 /* mailbox indices */ -#define MBX_RX_1 1 -#define MBX_TX_0 2 -#define MBX_TX_1 3 - -struct zatm_vcc { - /*-------------------------------- RX part */ - int rx_chan; /* RX channel, 0 if none */ - int pool; /* free buffer pool */ - /*-------------------------------- TX part */ - int tx_chan; /* TX channel, 0 if none */ - int shaper; /* shaper, <0 if none */ - struct sk_buff_head tx_queue; /* list of buffers in transit */ - wait_queue_head_t tx_wait; /* for close */ - u32 *ring; /* transmit ring */ - int ring_curr; /* current write position */ - int txing; /* number of transmits in progress */ - struct sk_buff_head backlog; /* list of buffers waiting for ring */ -}; - -struct zatm_dev { - /*-------------------------------- TX part */ - int tx_bw; /* remaining bandwidth */ - u32 free_shapers; /* bit set */ - int ubr; /* UBR shaper; -1 if none */ - int ubr_ref_cnt; /* number of VCs using UBR shaper */ - /*-------------------------------- RX part */ - int pool_ref[NR_POOLS]; /* free buffer pool usage counters */ - volatile struct sk_buff *last_free[NR_POOLS]; - /* last entry in respective pool */ - struct sk_buff_head pool[NR_POOLS];/* free 
buffer pools */ - struct zatm_pool_info pool_info[NR_POOLS]; /* pool information */ - /*-------------------------------- maps */ - struct atm_vcc **tx_map; /* TX VCCs */ - struct atm_vcc **rx_map; /* RX VCCs */ - int chans; /* map size, must be 2^n */ - /*-------------------------------- mailboxes */ - unsigned long mbx_start[NR_MBX];/* start addresses */ - dma_addr_t mbx_dma[NR_MBX]; - u16 mbx_end[NR_MBX]; /* end offset (in bytes) */ - /*-------------------------------- other pointers */ - u32 pool_base; /* Free buffer pool dsc (word addr) */ - /*-------------------------------- ZATM links */ - struct atm_dev *more; /* other ZATM devices */ - /*-------------------------------- general information */ - int mem; /* RAM on board (in bytes) */ - int khz; /* timer clock */ - int copper; /* PHY type */ - unsigned char irq; /* IRQ */ - unsigned int base; /* IO base address */ - struct pci_dev *pci_dev; /* PCI stuff */ - spinlock_t lock; -}; - - -#define ZATM_DEV(d) ((struct zatm_dev *) (d)->dev_data) -#define ZATM_VCC(d) ((struct zatm_vcc *) (d)->dev_data) - - -struct zatm_skb_prv { - struct atm_skb_data _; /* reserved */ - u32 *dsc; /* pointer to skb's descriptor */ -}; - -#define ZATM_PRV_DSC(skb) (((struct zatm_skb_prv *) (skb)->cb)->dsc) - -#endif diff --git a/include/uapi/linux/atm_zatm.h b/include/uapi/linux/atm_zatm.h deleted file mode 100644 index 5135027b93c1..000000000000 --- a/include/uapi/linux/atm_zatm.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* atm_zatm.h - Driver-specific declarations of the ZATM driver (for use by - driver-specific utilities) */ - -/* Written 1995-1999 by Werner Almesberger, EPFL LRC/ICA */ - - -#ifndef LINUX_ATM_ZATM_H -#define LINUX_ATM_ZATM_H - -/* - * Note: non-kernel programs including this file must also include - * sys/types.h for struct timeval - */ - -#include -#include - -#define ZATM_GETPOOL _IOW('a',ATMIOC_SARPRV+1,struct atmif_sioc) - /* get pool statistics */ -#define ZATM_GETPOOLZ _IOW('a',ATMIOC_SARPRV+2,struct atmif_sioc) - /* get statistics and zero */ -#define ZATM_SETPOOL _IOW('a',ATMIOC_SARPRV+3,struct atmif_sioc) - /* set pool parameters */ - -struct zatm_pool_info { - int ref_count; /* free buffer pool usage counters */ - int low_water,high_water; /* refill parameters */ - int rqa_count,rqu_count; /* queue condition counters */ - int offset,next_off; /* alignment optimizations: offset */ - int next_cnt,next_thres; /* repetition counter and threshold */ -}; - -struct zatm_pool_req { - int pool_num; /* pool number */ - struct zatm_pool_info info; /* actual information */ -}; - -#define ZATM_OAM_POOL 0 /* free buffer pool for OAM cells */ -#define ZATM_AAL0_POOL 1 /* free buffer pool for AAL0 cells */ -#define ZATM_AAL5_POOL_BASE 2 /* first AAL5 free buffer pool */ -#define ZATM_LAST_POOL ZATM_AAL5_POOL_BASE+10 /* max. 64 kB */ - -#define ZATM_TIMER_HISTORY_SIZE 16 /* number of timer adjustments to - record; must be 2^n */ - -#endif -- cgit v1.2.3 From c35fe2a68f29a0bda15ae994154cacaae5f69791 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 25 Apr 2022 16:18:33 +0100 Subject: elf: Fix the arm64 MTE ELF segment name and value Unfortunately, the name/value choice for the MTE ELF segment type (PT_ARM_MEMTAG_MTE) was pretty poor: LOPROC+1 is already in use by PT_AARCH64_UNWIND, as defined in the AArch64 ELF ABI (https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst). 
Update the ELF segment type value to LOPROC+2 and also change the define to
PT_AARCH64_MEMTAG_MTE to match the AArch64 ELF ABI namespace. The AArch64
ELF ABI document is being updated accordingly (the segment type was not
previously mentioned in the document).

Signed-off-by: Catalin Marinas
Fixes: 761b9b366cec ("elf: Introduce the ARM MTE ELF segment type")
Cc: Will Deacon
Cc: Jonathan Corbet
Cc: Eric Biederman
Cc: Kees Cook
Cc: Luis Machado
Cc: Richard Earnshaw
Link: https://lore.kernel.org/r/20220425151833.2603830-1-catalin.marinas@arm.com
Signed-off-by: Will Deacon
---
 Documentation/arm64/memory-tagging-extension.rst | 4 ++--
 arch/arm64/kernel/elfcore.c | 2 +-
 include/uapi/linux/elf.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst
index dd27f78d7608..dbae47bba25e 100644
--- a/Documentation/arm64/memory-tagging-extension.rst
+++ b/Documentation/arm64/memory-tagging-extension.rst
@@ -228,10 +228,10 @@ Core dump support
 -----------------
 The allocation tags for user memory mapped with ``PROT_MTE`` are dumped
-in the core file as additional ``PT_ARM_MEMTAG_MTE`` segments. The
+in the core file as additional ``PT_AARCH64_MEMTAG_MTE`` segments. The
 program header for such segment is defined as:
-:``p_type``: ``PT_ARM_MEMTAG_MTE``
+:``p_type``: ``PT_AARCH64_MEMTAG_MTE``
 :``p_flags``: 0
 :``p_offset``: segment file offset
 :``p_vaddr``: segment virtual address, same as the corresponding
diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c
index 2b3f3d0544b9..98d67444a5b6 100644
--- a/arch/arm64/kernel/elfcore.c
+++ b/arch/arm64/kernel/elfcore.c
@@ -95,7 +95,7 @@ int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
 for_each_mte_vma(current, vma) {
 struct elf_phdr phdr;
- phdr.p_type = PT_ARM_MEMTAG_MTE;
+ phdr.p_type = PT_AARCH64_MEMTAG_MTE;
 phdr.p_offset = offset;
 phdr.p_vaddr = vma->vm_start;
 phdr.p_paddr = 0;
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 787c657bfae8..7ce993e6786c 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -42,7 +42,7 @@ typedef __s64 Elf64_Sxword;
 /* ARM MTE memory tag segment type */
-#define PT_ARM_MEMTAG_MTE (PT_LOPROC + 0x1)
+#define PT_AARCH64_MEMTAG_MTE (PT_LOPROC + 0x2)
 /*
 * Extended Numbering
-- cgit v1.2.3


From d495f942f40aa412f8d4d65951152648cfa09903 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 22 Apr 2022 12:30:13 +0200
Subject: KVM: fix bad user ABI for KVM_EXIT_SYSTEM_EVENT

When KVM_EXIT_SYSTEM_EVENT was introduced, it included a flags member
that at the time was unused. Unfortunately this extensibility mechanism
has several issues:

- x86 is not writing the member, so it would not be possible to use it
  on x86 except for new events

- the member is not aligned to 64 bits, so the definition of the uAPI
  struct is incorrect for 32-bit userspace on a 64-bit kernel. This is a
  problem for RISC-V, which supports CONFIG_KVM_COMPAT, but fortunately
  usage of flags was only introduced in 5.18.

Since padding has to be introduced, place a new field in there that
tells if the flags field is valid. In fact, to allow further
extensibility, change flags to an array of 16 values, and store how many
of the values are valid. The availability of the new ndata field is tied
to a system capability; all architectures are changed to fill in the
field.
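A rough userspace sketch of what consuming the new layout could look like
(illustrative only, not part of this patch; assumes the updated
<linux/kvm.h> is installed and that KVM_CAP_SYSTEM_EVENT_DATA was queried
via KVM_CHECK_EXTENSION beforehand):

	#include <linux/kvm.h>
	#include <stdio.h>

	/* Hypothetical VMM helper: dump a KVM_EXIT_SYSTEM_EVENT.
	 * have_data_cap is the result of the KVM_CAP_SYSTEM_EVENT_DATA
	 * check; without the capability, ndata/data[] are not filled in
	 * and must not be trusted. */
	static void dump_system_event(const struct kvm_run *run, int have_data_cap)
	{
		__u32 i;

		printf("system event type %u\n", run->system_event.type);
		if (!have_data_cap)
			return;
		/* only the first ndata entries (at most 16) are valid */
		for (i = 0; i < run->system_event.ndata && i < 16; i++)
			printf("  data[%u] = 0x%llx\n", i,
			       (unsigned long long)run->system_event.data[i]);
	}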
To avoid breaking compilation of userspace that was using the flags field, provide a userspace-only union to overlap flags with data[0]. The new field is placed at the same offset for both 32- and 64-bit userspace. Cc: Will Deacon Cc: Marc Zyngier Cc: Peter Gonda Cc: Sean Christopherson Signed-off-by: Paolo Bonzini Reported-by: kernel test robot Message-Id: <20220422103013.34832-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 24 +++++++++++++++++------- arch/arm64/kvm/psci.c | 3 ++- arch/riscv/kvm/vcpu_sbi.c | 5 +++-- arch/x86/kvm/x86.c | 2 ++ include/uapi/linux/kvm.h | 10 +++++++++- virt/kvm/kvm_main.c | 1 + 6 files changed, 34 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 85c7abc51af5..4a900cdbc62e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field. #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 __u32 type; - __u64 flags; + __u32 ndata; + __u64 data[16]; } system_event; If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered a system-level event using some architecture specific mechanism (hypercall or some special instruction). In case of ARM64, this is triggered using -HVC instruction based PSCI call from the vcpu. The 'type' field describes -the system-level event type. The 'flags' field describes architecture -specific flags for the system-level event. +HVC instruction based PSCI call from the vcpu. +The 'type' field describes the system-level event type. Valid values for 'type' are: - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the @@ -6010,10 +6010,20 @@ Valid values for 'type' are: to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. -Valid flags are: +If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain +architecture specific information for the system-level event. Only +the first `ndata` items (possibly zero) of the data array are valid. - - KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued - a SYSTEM_RESET2 call according to v1.1 of the PSCI specification. + - for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if + the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI + specification. + + - for RISC-V, data[0] is set to the value of the second argument of the + ``sbi_system_reset`` call. + +Previous versions of Linux defined a `flags` member in this struct. The +field is now aliased to `data[0]`. Userspace can assume that it is only +written if ndata is greater than 0. 
::

diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index baac2b405f23..708d80e8e60d 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
 vcpu->run->system_event.type = type;
- vcpu->run->system_event.flags = flags;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = flags;
 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 }
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index a09ecb97b890..d45e7da3f0d3 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
 void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
 struct kvm_run *run,
- u32 type, u64 flags)
+ u32 type, u64 reason)
 {
 unsigned long i;
 struct kvm_vcpu *tmp;
@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
 memset(&run->system_event, 0, sizeof(run->system_event));
 run->system_event.type = type;
- run->system_event.flags = flags;
+ run->system_event.ndata = 1;
+ run->system_event.data[0] = reason;
 run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43174a8d9497..07d789b1d366 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10015,12 +10015,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ vcpu->run->system_event.ndata = 0;
 r = 0;
 goto out;
 }
 if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ vcpu->run->system_event.ndata = 0;
 r = 0;
 goto out;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 91a6fe4e02c0..6a184d260c7f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -445,7 +445,13 @@ struct kvm_run {
 #define KVM_SYSTEM_EVENT_RESET 2
 #define KVM_SYSTEM_EVENT_CRASH 3
 __u32 type;
- __u64 flags;
+ __u32 ndata;
+ union {
+#ifndef __KERNEL__
+ __u64 flags;
+#endif
+ __u64 data[16];
+ };
 } system_event;
 /* KVM_EXIT_S390_STSI */
 struct {
@@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_MEM_OP_EXTENSION 211
 #define KVM_CAP_PMU_CAPABILITY 212
 #define KVM_CAP_DISABLE_QUIRKS2 213
+/* #define KVM_CAP_VM_TSC_CONTROL 214 */
+#define KVM_CAP_SYSTEM_EVENT_DATA 215
 #ifdef KVM_CAP_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dfb7dabdbc63..ac57fc2c935f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4333,6 +4333,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 return 0;
 #endif
 case KVM_CAP_BINARY_STATS_FD:
+ case KVM_CAP_SYSTEM_EVENT_DATA:
 return 1;
 default:
 break;
-- cgit v1.2.3


From 0e0af57e0e91b304f36b7d1dba859e3c04094273 Mon Sep 17 00:00:00 2001
From: "Dr. Thomas Orgis"
Date: Fri, 29 Apr 2022 14:38:03 -0700
Subject: taskstats: version 12 with thread group and exe info

The task exit struct needs some crucial information to be able to
provide an enhanced version of process and thread accounting. This
change provides:
1. ac_tgid in addition to ac_pid
2. thread group execution walltime in ac_tgetime
3. flag AGROUP in ac_flag to indicate the last task in a thread group / process
4.
device ID and inode of task's /proc/self/exe in ac_exe_dev and ac_exe_inode
5. tools/accounting/procacct as demonstrator

When a task exits, taskstats are reported to userspace including the
task's pid and ppid, but without the id of the thread group this task is
part of. Without the tgid, the stats of single tasks cannot be correlated
to each other as a thread group (process).

The taskstats documentation suggests that on process exit a data set
consisting of accumulated stats for the whole group is produced. But such
an additional set of stats is only produced for actually multithreaded
processes, not groups that had only one thread, and also those stats only
contain data about delay accounting and not the more basic information
about CPU and memory resource usage. Adding the AGROUP flag, set when the
last task of a group exits, enables determining the end of a process also
for single-threaded processes.

My application basically does enhanced process accounting with summed
cputime, biggest maxrss, and tasks per process. The data is not available
with the traditional BSD process accounting (which is not designed to be
extensible), and the taskstats interface allows more efficient on-the-fly
grouping and summing of the stats anyway, without intermediate disk
writes.

Furthermore, I keep statistics on how often each exact program binary is
used, and with what resources, giving a picture of how important the
various parts and versions of an installed scientific software collection
are, and how much load they put on the machine. This is enabled by
providing information on /proc/self/exe for each task. I assume the two
64-bit fields for device ID and inode are more appropriate than the
possibly large resolved path to keep the data volume down.

Add the tgid to the stats to complete task identification, the flag
AGROUP to mark the last task of a group, the group wallclock time, and
inode-based identification of the associated executable file.

Add tools/accounting/procacct.c as a simplified fork of getdelays.c to
demonstrate process and thread accounting.

[thomas.orgis@uni-hamburg.de: fix version number in comment]
Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster
Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster
Signed-off-by: Dr. Thomas Orgis
Reviewed-by: Ismael Luceno
Cc: Balbir Singh
Cc: Eric W. Biederman
Cc: xu xin
Cc: Yang Yang
Signed-off-by: Andrew Morton
---
 include/uapi/linux/acct.h | 3 +-
 include/uapi/linux/taskstats.h | 24 ++-
 kernel/taskstats.c | 23 +++
 kernel/tsacct.c | 10 +-
 tools/accounting/.gitignore | 1 +
 tools/accounting/Makefile | 2 +-
 tools/accounting/procacct.c | 417 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 473 insertions(+), 7 deletions(-)
 create mode 100644 tools/accounting/procacct.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/acct.h b/include/uapi/linux/acct.h
index 985b89068591..0e591152aa8a 100644
--- a/include/uapi/linux/acct.h
+++ b/include/uapi/linux/acct.h
@@ -103,12 +103,13 @@ struct acct_v3
 /*
 * accounting flags
 */
- /* bit set when the process ... */
+ /* bit set when the process/task ... */
 #define AFORK 0x01 /* ... executed fork, but did not exec */
 #define ASU 0x02 /* ... used super-user privileges */
 #define ACOMPAT 0x04 /* ... used compatibility mode (VAX only not used) */
 #define ACORE 0x08 /* ... dumped core */
 #define AXSIG 0x10 /* ... was killed by a signal */
+#define AGROUP 0x20 /* ... was the last task of the process (task group) */
 #if defined(__BYTE_ORDER) ?
__BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) #define ACCT_BYTEORDER 0x80 /* accounting file is big endian */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 12327d32378f..736154171489 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 11 +#define TASKSTATS_VERSION 12 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -48,7 +48,8 @@ struct taskstats { __u32 ac_exitcode; /* Exit status */ /* The accounting flags of a task as defined in - * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG. + * Defined values are AFORK, ASU, ACOMPAT, ACORE, AXSIG, and AGROUP. + * (AGROUP since version 12). */ __u8 ac_flag; /* Record flags */ __u8 ac_nice; /* task_nice */ @@ -173,9 +174,26 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ - /* Delay waiting for memory compact */ + /* v11: Delay waiting for memory compact */ __u64 compact_count; __u64 compact_delay_total; + + /* v12 begin */ + __u32 ac_tgid; /* thread group ID */ + /* Thread group walltime up to now. This is total process walltime if + * AGROUP flag is set. + */ + __u64 ac_tgetime __attribute__((aligned(8))); + /* Lightweight information to identify process binary files. + * This leaves userspace to match this to a file system path, using + * MAJOR() and MINOR() macros to identify a device and mount point, + * the inode to identify the executable file. This is /proc/self/exe + * at the end, so matching the most recent exec(). Values are zero + * for kernel threads. + */ + __u64 ac_exe_dev; /* program binary device ID */ + __u64 ac_exe_inode; /* program binary inode number */ + /* v12 end */ }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bcac5a9043aa..72415e22342b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } +static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + /* No idea if I'm allowed to access that here, now. */ + struct file *exe_file = get_task_exe_file(tsk); + + if (exe_file) { + /* Following cp_new_stat64() in stat.c . 
*/ + stats->ac_exe_dev = + huge_encode_dev(exe_file->f_inode->i_sb->s_dev); + stats->ac_exe_inode = exe_file->f_inode->i_ino; + fput(exe_file); + } else { + stats->ac_exe_dev = 0; + stats->ac_exe_inode = 0; + } +} + static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) @@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); + + /* add executable info */ + exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) @@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); + if (group_dead) + stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1d261fbe367b..4252f0645b9e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; - u64 delta; + u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + now_ns = ktime_get_ns(); + /* store whole group time first */ + delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); + stats->ac_tgetime = delta; + delta = now_ns - tsk->start_time; + do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); @@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); + stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); diff --git a/tools/accounting/.gitignore b/tools/accounting/.gitignore index c45fb4ed4309..522a690aaf3d 100644 --- a/tools/accounting/.gitignore +++ b/tools/accounting/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only getdelays +procacct diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 03687f19cbb1..11def1ad046c 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays +PROGS := getdelays procacct all: $(PROGS) diff --git a/tools/accounting/procacct.c b/tools/accounting/procacct.c new file mode 100644 index 000000000000..8353d3237e50 --- /dev/null +++ b/tools/accounting/procacct.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0 +/* procacct.c + * + * Demonstrator of fetching resource data on task exit, as a way + * to accumulate accurate program resource usage statistics, without + * prior identification of the programs. For that, the fields for + * device and inode of the program executable binary file are also + * extracted in addition to the command string. + * + * The TGID together with the PID and the AGROUP flag allow + * identification of threads in a process and single-threaded processes. + * The ac_tgetime field gives proper whole-process walltime. 
+ * + * Written (changed) by Thomas Orgis, University of Hamburg in 2022 + * + * This is a cheap derivation (inheriting the style) of getdelays.c: + * + * Utility to get per-pid and per-tgid delay accounting statistics + * Also illustrates usage of the taskstats interface + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2005 + * Copyright (C) Balbir Singh, IBM Corp. 2006 + * Copyright (c) Jay Lan, SGI. 2006 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Generic macros for dealing with netlink sockets. Might be duplicated + * elsewhere. It is recommended that commercial grade applications use + * libnl or libnetlink and use the interfaces provided by the library + */ +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define err(code, fmt, arg...) \ + do { \ + fprintf(stderr, fmt, ##arg); \ + exit(code); \ + } while (0) + +int rcvbufsz; +char name[100]; +int dbg; +int print_delays; +int print_io_accounting; +int print_task_context_switch_counts; + +#define PRINTF(fmt, arg...) { \ + if (dbg) { \ + printf(fmt, ##arg); \ + } \ + } + +/* Maximum size of response requested or message sent */ +#define MAX_MSG_SIZE 1024 +/* Maximum number of cpus expected to be specified in a cpumask */ +#define MAX_CPUS 32 + +struct msgtemplate { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; +}; + +char cpumask[100+6*MAX_CPUS]; + +static void usage(void) +{ + fprintf(stderr, "procacct [-v] [-w logfile] [-r bufsize] [-m cpumask]\n"); + fprintf(stderr, " -v: debug on\n"); +} + +/* + * Create a raw netlink socket and bind + */ +static int create_nl_socket(int protocol) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, protocol); + if (fd < 0) + return -1; + + if (rcvbufsz) + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &rcvbufsz, sizeof(rcvbufsz)) < 0) { + fprintf(stderr, "Unable to set socket rcv buf size to %d\n", + rcvbufsz); + goto error; + } + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) + goto error; + + return fd; +error: + close(fd); + return -1; +} + + +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct nlattr *na; + struct sockaddr_nl nladdr; + int r, buflen; + char *buf; + + struct msgtemplate msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + 1 + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + + +/* + * Probe the controller in genetlink to find the family id + * for the TASKSTATS family + */ +static int 
get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + + strcpy(name, TASKSTATS_GENL_NAME); + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; /* sendto() failure? */ + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) + return 0; + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + + return id; +} + +#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) + +static void print_procacct(struct taskstats *t) +{ + /* First letter: T is a mere thread, G the last in a group, U unknown. */ + printf( + "%c pid=%lu tgid=%lu uid=%lu wall=%llu gwall=%llu cpu=%llu vmpeak=%llu rsspeak=%llu dev=%lu:%lu inode=%llu comm=%s\n" + , t->version >= 12 ? (t->ac_flag & AGROUP ? 'P' : 'T') : '?' + , (unsigned long)t->ac_pid + , (unsigned long)(t->version >= 12 ? t->ac_tgid : 0) + , (unsigned long)t->ac_uid + , (unsigned long long)t->ac_etime + , (unsigned long long)(t->version >= 12 ? t->ac_tgetime : 0) + , (unsigned long long)(t->ac_utime+t->ac_stime) + , (unsigned long long)t->hiwater_vm + , (unsigned long long)t->hiwater_rss + , (unsigned long)(t->version >= 12 ? MAJOR(t->ac_exe_dev) : 0) + , (unsigned long)(t->version >= 12 ? MINOR(t->ac_exe_dev) : 0) + , (unsigned long long)(t->version >= 12 ? t->ac_exe_inode : 0) + , t->ac_comm + ); +} + +void handle_aggr(int mother, struct nlattr *na, int fd) +{ + int aggr_len = NLA_PAYLOAD(na->nla_len); + int len2 = 0; + pid_t rtid = 0; + + na = (struct nlattr *) NLA_DATA(na); + while (len2 < aggr_len) { + switch (na->nla_type) { + case TASKSTATS_TYPE_PID: + rtid = *(int *) NLA_DATA(na); + PRINTF("PID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_TGID: + rtid = *(int *) NLA_DATA(na); + PRINTF("TGID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_STATS: + if (mother == TASKSTATS_TYPE_AGGR_PID) + print_procacct((struct taskstats *) NLA_DATA(na)); + if (fd) { + if (write(fd, NLA_DATA(na), na->nla_len) < 0) + err(1, "write error\n"); + } + break; + case TASKSTATS_TYPE_NULL: + break; + default: + fprintf(stderr, "Unknown nested nla_type %d\n", + na->nla_type); + break; + } + len2 += NLA_ALIGN(na->nla_len); + na = (struct nlattr *)((char *)na + + NLA_ALIGN(na->nla_len)); + } +} + +int main(int argc, char *argv[]) +{ + int c, rc, rep_len, aggr_len, len2; + int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; + __u16 id; + __u32 mypid; + + struct nlattr *na; + int nl_sd = -1; + int len = 0; + pid_t tid = 0; + + int fd = 0; + int write_file = 0; + int maskset = 0; + char *logfile = NULL; + int containerset = 0; + char *containerpath = NULL; + int cfd = 0; + int forking = 0; + sigset_t sigset; + + struct msgtemplate msg; + + while (!forking) { + c = getopt(argc, argv, "m:vr:"); + if (c < 0) + break; + + switch (c) { + case 'w': + logfile = strdup(optarg); + printf("write to file %s\n", logfile); + write_file = 1; + break; + case 'r': + rcvbufsz = atoi(optarg); + printf("receive buf size %d\n", rcvbufsz); + if (rcvbufsz < 0) + err(1, "Invalid rcv buf size\n"); + break; + case 'm': + strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + maskset = 1; + break; + case 'v': + printf("debug on\n"); + dbg = 1; + break; + default: + 
usage();
+			exit(-1);
+		}
+	}
+	if (!maskset) {
+		maskset = 1;
+		strncpy(cpumask, "1", sizeof(cpumask));
+		cpumask[sizeof(cpumask) - 1] = '\0';
+	}
+	printf("cpumask %s maskset %d\n", cpumask, maskset);
+
+	if (write_file) {
+		fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+		if (fd == -1) {
+			perror("Cannot open output file\n");
+			exit(1);
+		}
+	}
+
+	nl_sd = create_nl_socket(NETLINK_GENERIC);
+	if (nl_sd < 0)
+		err(1, "error creating Netlink socket\n");
+
+	mypid = getpid();
+	id = get_family_id(nl_sd);
+	if (!id) {
+		fprintf(stderr, "Error getting family id, errno %d\n", errno);
+		goto err;
+	}
+	PRINTF("family id %d\n", id);
+
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+			      &cpumask, strlen(cpumask) + 1);
+		PRINTF("Sent register cpumask, retval %d\n", rc);
+		if (rc < 0) {
+			fprintf(stderr, "error sending register cpumask\n");
+			goto err;
+		}
+	}
+
+	do {
+		rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
+		PRINTF("received %d bytes\n", rep_len);
+
+		if (rep_len < 0) {
+			fprintf(stderr, "nonfatal reply error: errno %d\n",
+				errno);
+			continue;
+		}
+		if (msg.n.nlmsg_type == NLMSG_ERROR ||
+		    !NLMSG_OK((&msg.n), rep_len)) {
+			struct nlmsgerr *err = NLMSG_DATA(&msg);
+
+			fprintf(stderr, "fatal reply error, errno %d\n",
+				err->error);
+			goto done;
+		}
+
+		PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n",
+		       sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
+
+
+		rep_len = GENLMSG_PAYLOAD(&msg.n);
+
+		na = (struct nlattr *) GENLMSG_DATA(&msg);
+		len = 0;
+		while (len < rep_len) {
+			len += NLA_ALIGN(na->nla_len);
+			int mother = na->nla_type;
+
+			PRINTF("mother=%i\n", mother);
+			switch (na->nla_type) {
+			case TASKSTATS_TYPE_AGGR_PID:
+			case TASKSTATS_TYPE_AGGR_TGID:
+				/* For nested attributes, na follows */
+				handle_aggr(mother, na, fd);
+				break;
+			default:
+				fprintf(stderr, "Unexpected nla_type %d\n",
+					na->nla_type);
+			case TASKSTATS_TYPE_NULL:
+				break;
+			}
+			na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
+		}
+	} while (1);
+done:
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
+			      &cpumask, strlen(cpumask) + 1);
+		printf("Sent deregister mask, retval %d\n", rc);
+		if (rc < 0)
+			err(rc, "error sending deregister cpumask\n");
+	}
+err:
+	close(nl_sd);
+	if (fd)
+		close(fd);
+	if (cfd)
+		close(cfd);
+	return 0;
+}
-- cgit v1.2.3


From f548a12efd5ab97e6b1fb332e5634ce44b3d9328 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 26 Apr 2022 17:39:50 -0600
Subject: io_uring: return hint on whether more data is available after receive

For now just use a CQE flag for this; with big CQE support we could
return the actual number of bytes left.
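A sketch of the intended consumption (hypothetical liburing-based code, not
part of this patch; assumes headers that already define the new flag):

	#include <liburing.h>

	/* Hypothetical completion handler: returns nonzero when the socket
	 * still has queued data, so the caller can issue the next receive
	 * immediately instead of waiting for another poll trigger. */
	static int recv_has_more(const struct io_uring_cqe *cqe)
	{
		/* cqe->res is still the received byte count (or -errno) */
		return cqe->res > 0 && (cqe->flags & IORING_CQE_F_SOCK_NONEMPTY);
	}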
Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++++++++---- include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5a0388bac42c..20c5d29e5b6c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5948,6 +5948,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; struct io_buffer *kbuf; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5981,6 +5982,8 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); if (ret < min_ret) { @@ -6006,7 +6009,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } @@ -6018,6 +6024,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) void __user *buf = sr->buf; struct socket *sock; struct iovec iov; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -6038,11 +6045,12 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) goto out_free; msg.msg_name = NULL; + msg.msg_namelen = 0; msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; msg.msg_controllen = 0; - msg.msg_namelen = 0; msg.msg_iocb = NULL; - msg.msg_flags = 0; flags = req->sr_msg.msg_flags; if (force_nonblock) @@ -6073,7 +6081,10 @@ out_free: ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 49d1f3994f8d..92d1799892b2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -220,9 +220,11 @@ struct io_uring_cqe { * * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries + * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) +#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) enum { IORING_CQE_BUFFER_SHIFT = 16, -- cgit v1.2.3 From e1169f06d5bbdbc2b22ae4e3083a4bf75ae5ecee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:03 -0600 Subject: io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used If this is set, io_uring will never use an IPI to deliver a task_work notification. This can be used in the common case where a single task or thread communicates with the ring, and doesn't rely on io_uring_cqe_peek(). This provides a noticeable win in performance, both from eliminating the IPI itself, but also from avoiding interrupting the submitting task unnecessarily. 
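A minimal setup sketch (assuming liburing; setup_coop_ring is an illustrative helper, and error handling is trimmed), showing that the flag is simply passed through io_uring_params at ring creation:

#include <liburing.h>

/* Create a ring that never sends an IPI for task_work delivery. Only
 * appropriate when a single task owns the ring and always enters the
 * kernel to reap completions rather than relying on pure userspace
 * peeking. */
static int setup_coop_ring(struct io_uring *ring, unsigned entries)
{
	struct io_uring_params p = { };

	p.flags = IORING_SETUP_COOP_TASKRUN;
	return io_uring_queue_init_params(entries, ring, &p);
}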
Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-6-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++++---- include/uapi/linux/io_uring.h | 8 ++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c669d8f5e57..0b9ae3615911 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11327,12 +11327,20 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = get_uid(current_user()); /* - * For SQPOLL, we just need a wakeup, always. + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if + * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - if (ctx->flags & IORING_SETUP_SQPOLL) + ret = -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (ctx->flags & IORING_SETUP_COOP_TASKRUN) + goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; - else + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else { ctx->notify_method = TWA_SIGNAL; + } /* * This is just grabbed for accounting purposes. When a process exits, @@ -11431,7 +11439,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 980d82eb196e..a84f29d657c3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -102,6 +102,14 @@ enum { #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ +/* + * Cooperative task running. When requests complete, they often require + * forcing the submitter to transition to the kernel to complete. If this + * flag is set, work will be done when the task transitions anyway, rather + * than force an inter-processor interrupt reschedule. This avoids interrupting + * a task running in userspace, and saves an IPI. + */ +#define IORING_SETUP_COOP_TASKRUN (1U << 8) enum { IORING_OP_NOP, -- cgit v1.2.3 From ef060ea9e4fd3b763e7060a3af0a258d2d5d7c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Apr 2022 19:49:04 -0600 Subject: io_uring: add IORING_SETUP_TASKRUN_FLAG If IORING_SETUP_COOP_TASKRUN is set to use cooperative scheduling for running task_work, then IORING_SETUP_TASKRUN_FLAG can be set so the application can tell if task_work is pending in the kernel for this ring. This allows use cases like io_uring_peek_cqe() to still function appropriately, or for the task to know when it would be useful to call io_uring_wait_cqe() to run pending events. 
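For illustration, a sketch of the check an application could make before deciding whether to enter the kernel. This uses the raw interface; ring_task_work_pending and its sq_flags parameter (assumed to point at the SQ ring flags word, at offset io_uring_params.sq_off.flags into the SQ ring mapping) are assumptions, not part of the patch. Note that per the setup checks in this patch, the flag is only accepted together with IORING_SETUP_COOP_TASKRUN and is rejected with IORING_SETUP_SQPOLL:

#include <linux/io_uring.h>

/* Nonzero when the kernel has task_work queued for this ring. */
static inline int ring_task_work_pending(const unsigned *sq_flags)
{
	return __atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) & IORING_SQ_TASKRUN;
}

When this returns nonzero, any call that transitions into the kernel (io_uring_enter(), or io_uring_wait_cqe() under liburing) will run the pending task_work and post its completions.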
Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20220426014904.60384-7-axboe@kernel.dk Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++++++--- include/uapi/linux/io_uring.h | 7 +++++++ 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0b9ae3615911..72cb2d50125c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2506,6 +2506,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (*locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2646,6 +2648,9 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method))) return; @@ -11333,12 +11338,15 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ret = -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & IORING_SETUP_COOP_TASKRUN) + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG)) goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { ctx->notify_method = TWA_SIGNAL_NO_IPI; } else { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + goto err; ctx->notify_method = TWA_SIGNAL; } @@ -11440,10 +11448,10 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN)) + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG)) return -EINVAL; - return io_uring_create(entries, &p, params); + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a84f29d657c3..fad63564678a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -110,6 +110,12 @@ enum { * a task running in userspace, and saves an IPI. */ #define IORING_SETUP_COOP_TASKRUN (1U << 8) +/* + * If COOP_TASKRUN is set, get notified if task work is available for + * running and a kernel transition would be needed to run it. This sets + * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. + */ +#define IORING_SETUP_TASKRUN_FLAG (1U << 9) enum { IORING_OP_NOP, @@ -256,6 +262,7 @@ struct io_sqring_offsets { */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ +#define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */ struct io_cqring_offsets { __u32 head; -- cgit v1.2.3 From 3254e0b9eb5649ffaa48717ebc9c593adc4ee6a9 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:31 +0300 Subject: ethtool: Add 10base-T1L link mode entry Add entry for the 10base-T1L full duplex mode. Reviewed-by: Andrew Lunn Reviewed-by: Oleksij Rempel Signed-off-by: Alexandru Tachici Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-core.c | 3 ++- drivers/net/phy/phy_device.c | 3 ++- drivers/net/phy/phylink.c | 4 +++- include/linux/phy.h | 2 +- include/uapi/linux/ethtool.h | 1 + net/ethtool/common.c | 3 +++ 6 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 2001f3329133..1f2531a1a876 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -13,7 +13,7 @@ */ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 92, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 93, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -176,6 +176,7 @@ static const struct phy_setting settings[] = { /* 10M */ PHY_SETTING( 10, FULL, 10baseT_Full ), PHY_SETTING( 10, HALF, 10baseT_Half ), + PHY_SETTING( 10, FULL, 10baseT1L_Full ), }; #undef PHY_SETTING diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index f867042b2eb4..1369daeded14 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -90,8 +90,9 @@ const int phy_10_100_features_array[4] = { }; EXPORT_SYMBOL_GPL(phy_10_100_features_array); -const int phy_basic_t1_features_array[2] = { +const int phy_basic_t1_features_array[3] = { ETHTOOL_LINK_MODE_TP_BIT, + ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, ETHTOOL_LINK_MODE_100baseT1_Full_BIT, }; EXPORT_SYMBOL_GPL(phy_basic_t1_features_array); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 33c285252584..d707604d1d5a 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -168,8 +168,10 @@ static void phylink_caps_to_linkmodes(unsigned long *linkmodes, if (caps & MAC_10HD) __set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, linkmodes); - if (caps & MAC_10FD) + if (caps & MAC_10FD) { __set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, linkmodes); + __set_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, linkmodes); + } if (caps & MAC_100HD) { __set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, linkmodes); diff --git a/include/linux/phy.h b/include/linux/phy.h index 36ca2b5c2253..b12af9e2f389 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -65,7 +65,7 @@ extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; -extern const int phy_basic_t1_features_array[2]; +extern const int phy_basic_t1_features_array[3]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7bc4b8def12c..e0f0ee9bc89e 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1691,6 +1691,7 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, + ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 0c5210015911..566adf85e658 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -201,6 +201,7 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, CR4, Full), __DEFINE_LINK_MODE_NAME(100, FX, Half), __DEFINE_LINK_MODE_NAME(100, FX, Full), + 
__DEFINE_LINK_MODE_NAME(10, T1L, Full),
 };
 static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -236,6 +237,7 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
 #define __LINK_MODE_LANES_T1		1
 #define __LINK_MODE_LANES_X		1
 #define __LINK_MODE_LANES_FX		1
+#define __LINK_MODE_LANES_T1L		1
 
 #define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex)	\
 	[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = {		\
@@ -349,6 +351,7 @@ const struct link_mode_info link_mode_params[] = {
 	__DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
 	__DEFINE_LINK_MODE_PARAMS(100, FX, Half),
 	__DEFINE_LINK_MODE_PARAMS(100, FX, Full),
+	__DEFINE_LINK_MODE_PARAMS(10, T1L, Full),
 };
 static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
-- 
cgit v1.2.3


From 909b4f2bf764a903e9183111368f1509f9b40e6d Mon Sep 17 00:00:00 2001
From: Alexandru Tachici
Date: Fri, 29 Apr 2022 18:34:32 +0300
Subject: net: phy: Add 10-BaseT1L registers

The 802.3cg specification defines the 10BASE-T1L link mode for Ethernet
traffic on a twisted wire pair. The PMA status register can be used to
detect whether the PHY supports the 2.4 V TX level, and the PCS control
register can be used to enable/disable PCS-level loopback.

Reviewed-by: Andrew Lunn
Signed-off-by: Alexandru Tachici
Signed-off-by: David S. Miller
---
 include/uapi/linux/mdio.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index c54e6eae5366..0b2eba36dd7c 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -67,6 +67,9 @@
 #define MDIO_PCS_10GBRT_STAT2	33	/* 10GBASE-R/-T PCS status 2 */
 #define MDIO_AN_10GBT_CTRL	32	/* 10GBASE-T auto-negotiation control */
 #define MDIO_AN_10GBT_STAT	33	/* 10GBASE-T auto-negotiation status */
+#define MDIO_B10L_PMA_CTRL	2294	/* 10BASE-T1L PMA control */
+#define MDIO_PMA_10T1L_STAT	2295	/* 10BASE-T1L PMA status */
+#define MDIO_PCS_10T1L_CTRL	2278	/* 10BASE-T1L PCS control */
 
 /* LASI (Link Alarm Status Interrupt) registers, defined by XENPAK MSA. */
 #define MDIO_PMA_LASI_RXCTRL	0x9000	/* RX_ALARM control */
@@ -268,6 +271,28 @@
 #define MDIO_AN_10GBT_STAT_MS	0x4000	/* Master/slave config */
 #define MDIO_AN_10GBT_STAT_MSFLT 0x8000	/* Master/slave config fault */
 
+/* 10BASE-T1L PMA control */
+#define MDIO_PMA_10T1L_CTRL_LB_EN	0x0001	/* Enable loopback mode */
+#define MDIO_PMA_10T1L_CTRL_EEE_EN	0x0400	/* Enable EEE mode */
+#define MDIO_PMA_10T1L_CTRL_LOW_POWER	0x0800	/* Low-power mode */
+#define MDIO_PMA_10T1L_CTRL_2V4_EN	0x1000	/* Enable 2.4 Vpp operating mode */
+#define MDIO_PMA_10T1L_CTRL_TX_DIS	0x4000	/* Transmit disable */
+#define MDIO_PMA_10T1L_CTRL_PMA_RST	0x8000	/* PMA reset */
+
+/* 10BASE-T1L PMA status register. */
+#define MDIO_PMA_10T1L_STAT_LINK	0x0001	/* PMA receive link up */
+#define MDIO_PMA_10T1L_STAT_FAULT	0x0002	/* Fault condition detected */
+#define MDIO_PMA_10T1L_STAT_POLARITY	0x0004	/* Receive polarity is reversed */
+#define MDIO_PMA_10T1L_STAT_RECV_FAULT	0x0200	/* Able to detect fault on receive path */
+#define MDIO_PMA_10T1L_STAT_EEE		0x0400	/* PHY has EEE ability */
+#define MDIO_PMA_10T1L_STAT_LOW_POWER	0x0800	/* PMA has low-power ability */
+#define MDIO_PMA_10T1L_STAT_2V4_ABLE	0x1000	/* PHY has 2.4 Vpp operating mode ability */
+#define MDIO_PMA_10T1L_STAT_LB_ABLE	0x2000	/* PHY has loopback ability */
+
+/* 10BASE-T1L PCS control register.
*/ +#define MDIO_PCS_10T1L_CTRL_LB 0x4000 /* Enable PCS level loopback mode */ +#define MDIO_PCS_10T1L_CTRL_RESET 0x8000 /* PCS reset */ + /* EEE Supported/Advertisement/LP Advertisement registers. * * EEE capability Register (3.20), Advertisement (7.60) and -- cgit v1.2.3 From 1b020e448e0fb67bcb04ee0f778d413045f965d3 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:33 +0300 Subject: net: phy: Add BaseT1 auto-negotiation registers Added BASE-T1 AN advertisement register (Registers 7.514, 7.515, and 7.516) and BASE-T1 AN LP Base Page ability register (Registers 7.517, 7.518, and 7.519). Reviewed-by: Andrew Lunn Signed-off-by: Alexandru Tachici Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 0b2eba36dd7c..fa3515257f54 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -70,6 +70,14 @@ #define MDIO_B10L_PMA_CTRL 2294 /* 10BASE-T1L PMA control */ #define MDIO_PMA_10T1L_STAT 2295 /* 10BASE-T1L PMA status */ #define MDIO_PCS_10T1L_CTRL 2278 /* 10BASE-T1L PCS control */ +#define MDIO_AN_T1_CTRL 512 /* BASE-T1 AN control */ +#define MDIO_AN_T1_STAT 513 /* BASE-T1 AN status */ +#define MDIO_AN_T1_ADV_L 514 /* BASE-T1 AN advertisement register [15:0] */ +#define MDIO_AN_T1_ADV_M 515 /* BASE-T1 AN advertisement register [31:16] */ +#define MDIO_AN_T1_ADV_H 516 /* BASE-T1 AN advertisement register [47:32] */ +#define MDIO_AN_T1_LP_L 517 /* BASE-T1 AN LP Base Page ability register [15:0] */ +#define MDIO_AN_T1_LP_M 518 /* BASE-T1 AN LP Base Page ability register [31:16] */ +#define MDIO_AN_T1_LP_H 519 /* BASE-T1 AN LP Base Page ability register [47:32] */ /* LASI (Link Alarm Status Interrupt) registers, defined by XENPAK MSA. 
*/ #define MDIO_PMA_LASI_RXCTRL 0x9000 /* RX_ALARM control */ @@ -293,6 +301,38 @@ #define MDIO_PCS_10T1L_CTRL_LB 0x4000 /* Enable PCS level loopback mode */ #define MDIO_PCS_10T1L_CTRL_RESET 0x8000 /* PCS reset */ +/* BASE-T1 auto-negotiation advertisement register [15:0] */ +#define MDIO_AN_T1_ADV_L_PAUSE_CAP ADVERTISE_PAUSE_CAP +#define MDIO_AN_T1_ADV_L_PAUSE_ASYM ADVERTISE_PAUSE_ASYM +#define MDIO_AN_T1_ADV_L_FORCE_MS 0x1000 /* Force Master/slave Configuration */ +#define MDIO_AN_T1_ADV_L_REMOTE_FAULT ADVERTISE_RFAULT +#define MDIO_AN_T1_ADV_L_ACK ADVERTISE_LPACK +#define MDIO_AN_T1_ADV_L_NEXT_PAGE_REQ ADVERTISE_NPAGE + +/* BASE-T1 auto-negotiation advertisement register [31:16] */ +#define MDIO_AN_T1_ADV_M_B10L 0x4000 /* device is compatible with 10BASE-T1L */ +#define MDIO_AN_T1_ADV_M_MST 0x0010 /* advertise master preference */ + +/* BASE-T1 auto-negotiation advertisement register [47:32] */ +#define MDIO_AN_T1_ADV_H_10L_TX_HI_REQ 0x1000 /* 10BASE-T1L High Level Transmit Request */ +#define MDIO_AN_T1_ADV_H_10L_TX_HI 0x2000 /* 10BASE-T1L High Level Transmit Ability */ + +/* BASE-T1 AN LP Base Page ability register [15:0] */ +#define MDIO_AN_T1_LP_L_PAUSE_CAP LPA_PAUSE_CAP +#define MDIO_AN_T1_LP_L_PAUSE_ASYM LPA_PAUSE_ASYM +#define MDIO_AN_T1_LP_L_FORCE_MS 0x1000 /* LP Force Master/slave Configuration */ +#define MDIO_AN_T1_LP_L_REMOTE_FAULT LPA_RFAULT +#define MDIO_AN_T1_LP_L_ACK LPA_LPACK +#define MDIO_AN_T1_LP_L_NEXT_PAGE_REQ LPA_NPAGE + +/* BASE-T1 AN LP Base Page ability register [31:16] */ +#define MDIO_AN_T1_LP_M_MST 0x0010 /* LP master preference */ +#define MDIO_AN_T1_LP_M_B10L 0x4000 /* LP is compatible with 10BASE-T1L */ + +/* BASE-T1 AN LP Base Page ability register [47:32] */ +#define MDIO_AN_T1_LP_H_10L_TX_HI_REQ 0x1000 /* 10BASE-T1L High Level LP Transmit Request */ +#define MDIO_AN_T1_LP_H_10L_TX_HI 0x2000 /* 10BASE-T1L High Level LP Transmit Ability */ + /* EEE Supported/Advertisement/LP Advertisement registers. * * EEE capability Register (3.20), Advertisement (7.60) and -- cgit v1.2.3 From 3da8ffd8545f62fec85a48a3c637b2f427974f11 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:34 +0300 Subject: net: phy: Add 10BASE-T1L support in phy-c45 This patch is needed because the BASE-T1 uses different registers for status, control and advertisement to those already employed in the existing phy-c45 functions. Where required, genphy_c45 functions will now check whether the device supports BASE-T1 and use the specific registers instead: 45.2.7.19 BASE-T1 AN control register, 45.2.7.20 BASE-T1 AN status, 45.2.7.21 BASE-T1 AN advertisement register, 45.2.7.22 BASE-T1 AN LP Base Page ability register, 45.2.1.185 BASE-T1 PMA/PMD control register. Tested-by: Oleksij Rempel Signed-off-by: Alexandru Tachici Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-c45.c | 257 ++++++++++++++++++++++++++++++++++++++++++- drivers/net/phy/phy_device.c | 1 + include/linux/mdio.h | 70 ++++++++++++ include/linux/phy.h | 3 + include/uapi/linux/mdio.h | 10 ++ 5 files changed, 336 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index db709d30bf84..eefdd67d5556 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -8,6 +8,25 @@ #include #include +/** + * genphy_c45_baset1_able - checks if the PMA has BASE-T1 extended abilities + * @phydev: target phy_device struct + */ +static bool genphy_c45_baset1_able(struct phy_device *phydev) +{ + int val; + + if (phydev->pma_extable == -ENODATA) { + val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_EXTABLE); + if (val < 0) + return false; + + phydev->pma_extable = val; + } + + return !!(phydev->pma_extable & MDIO_PMA_EXTABLE_BT1); +} + /** * genphy_c45_pma_can_sleep - checks if the PMA have sleep support * @phydev: target phy_device struct @@ -80,7 +99,10 @@ int genphy_c45_pma_setup_forced(struct phy_device *phydev) switch (phydev->speed) { case SPEED_10: - ctrl2 |= MDIO_PMA_CTRL2_10BT; + if (genphy_c45_baset1_able(phydev)) + ctrl2 |= MDIO_PMA_CTRL2_BASET1; + else + ctrl2 |= MDIO_PMA_CTRL2_10BT; break; case SPEED_100: ctrl1 |= MDIO_PMA_CTRL1_SPEED100; @@ -118,10 +140,95 @@ int genphy_c45_pma_setup_forced(struct phy_device *phydev) if (ret < 0) return ret; + if (genphy_c45_baset1_able(phydev)) { + int ctl = 0; + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_MASTER_FORCE: + ctl = MDIO_PMA_PMD_BT1_CTRL_CFG_MST; + break; + case MASTER_SLAVE_CFG_SLAVE_FORCE: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + case MASTER_SLAVE_CFG_UNKNOWN: + case MASTER_SLAVE_CFG_UNSUPPORTED: + break; + default: + phydev_warn(phydev, "Unsupported Master/Slave mode\n"); + return -EOPNOTSUPP; + } + + ret = phy_modify_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_PMD_BT1_CTRL, + MDIO_PMA_PMD_BT1_CTRL_CFG_MST, ctl); + if (ret < 0) + return ret; + } + return genphy_c45_an_disable_aneg(phydev); } EXPORT_SYMBOL_GPL(genphy_c45_pma_setup_forced); +/* Sets master/slave preference and supported technologies. + * The preference is set in the BIT(4) of BASE-T1 AN + * advertisement register 7.515 and whether the status + * is forced or not, it is set in the BIT(12) of BASE-T1 + * AN advertisement register 7.514. + * Sets 10BASE-T1L Ability BIT(14) in BASE-T1 autonegotiation + * advertisement register [31:16] if supported. 
+ */ +static int genphy_c45_baset1_an_config_aneg(struct phy_device *phydev) +{ + int changed = 0; + u16 adv_l = 0; + u16 adv_m = 0; + int ret; + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_SLAVE_FORCE: + adv_l |= MDIO_AN_T1_ADV_L_FORCE_MS; + break; + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + break; + default: + break; + } + + switch (phydev->master_slave_set) { + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + adv_m |= MDIO_AN_T1_ADV_M_MST; + break; + case MASTER_SLAVE_CFG_SLAVE_FORCE: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + break; + default: + break; + } + + adv_l |= linkmode_adv_to_mii_t1_adv_l_t(phydev->advertising); + + ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_T1_ADV_L, + (MDIO_AN_T1_ADV_L_FORCE_MS | MDIO_AN_T1_ADV_L_PAUSE_CAP + | MDIO_AN_T1_ADV_L_PAUSE_ASYM), adv_l); + if (ret < 0) + return ret; + if (ret > 0) + changed = 1; + + adv_m |= linkmode_adv_to_mii_t1_adv_m_t(phydev->advertising); + + ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_T1_ADV_M, + MDIO_AN_T1_ADV_M_MST | MDIO_AN_T1_ADV_M_B10L, adv_m); + if (ret < 0) + return ret; + if (ret > 0) + changed = 1; + + return changed; +} + /** * genphy_c45_an_config_aneg - configure advertisement registers * @phydev: target phy_device struct @@ -141,6 +248,9 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev) changed = genphy_config_eee_advert(phydev); + if (genphy_c45_baset1_able(phydev)) + return genphy_c45_baset1_an_config_aneg(phydev); + adv = linkmode_adv_to_mii_adv_t(phydev->advertising); ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_ADVERTISE, @@ -178,8 +288,12 @@ EXPORT_SYMBOL_GPL(genphy_c45_an_config_aneg); */ int genphy_c45_an_disable_aneg(struct phy_device *phydev) { + u16 reg = MDIO_CTRL1; - return phy_clear_bits_mmd(phydev, MDIO_MMD_AN, MDIO_CTRL1, + if (genphy_c45_baset1_able(phydev)) + reg = MDIO_AN_T1_CTRL; + + return phy_clear_bits_mmd(phydev, MDIO_MMD_AN, reg, MDIO_AN_CTRL1_ENABLE | MDIO_AN_CTRL1_RESTART); } EXPORT_SYMBOL_GPL(genphy_c45_an_disable_aneg); @@ -194,7 +308,12 @@ EXPORT_SYMBOL_GPL(genphy_c45_an_disable_aneg); */ int genphy_c45_restart_aneg(struct phy_device *phydev) { - return phy_set_bits_mmd(phydev, MDIO_MMD_AN, MDIO_CTRL1, + u16 reg = MDIO_CTRL1; + + if (genphy_c45_baset1_able(phydev)) + reg = MDIO_AN_T1_CTRL; + + return phy_set_bits_mmd(phydev, MDIO_MMD_AN, reg, MDIO_AN_CTRL1_ENABLE | MDIO_AN_CTRL1_RESTART); } EXPORT_SYMBOL_GPL(genphy_c45_restart_aneg); @@ -210,11 +329,15 @@ EXPORT_SYMBOL_GPL(genphy_c45_restart_aneg); */ int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart) { + u16 reg = MDIO_CTRL1; int ret; + if (genphy_c45_baset1_able(phydev)) + reg = MDIO_AN_T1_CTRL; + if (!restart) { /* Configure and restart aneg if it wasn't set before */ - ret = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_CTRL1); + ret = phy_read_mmd(phydev, MDIO_MMD_AN, reg); if (ret < 0) return ret; @@ -242,7 +365,13 @@ EXPORT_SYMBOL_GPL(genphy_c45_check_and_restart_aneg); */ int genphy_c45_aneg_done(struct phy_device *phydev) { - int val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1); + int reg = MDIO_STAT1; + int val; + + if (genphy_c45_baset1_able(phydev)) + reg = MDIO_AN_T1_STAT; + + val = phy_read_mmd(phydev, MDIO_MMD_AN, reg); return val < 0 ? val : val & MDIO_AN_STAT1_COMPLETE ? 
1 : 0;
 }
 
@@ -307,6 +436,49 @@ int genphy_c45_read_link(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(genphy_c45_read_link);
 
+/* Read the Clause 45 defined BASE-T1 AN (7.513) status register to check
+ * if autoneg is complete. If so read the BASE-T1 Autonegotiation
+ * Advertisement registers filling in the link partner advertisement,
+ * pause and asym_pause members in phydev.
+ */
+static int genphy_c45_baset1_read_lpa(struct phy_device *phydev)
+{
+	int val;
+
+	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_STAT);
+	if (val < 0)
+		return val;
+
+	if (!(val & MDIO_AN_STAT1_COMPLETE)) {
+		linkmode_clear_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, phydev->lp_advertising);
+		mii_t1_adv_l_mod_linkmode_t(phydev->lp_advertising, 0);
+		mii_t1_adv_m_mod_linkmode_t(phydev->lp_advertising, 0);
+
+		phydev->pause = 0;
+		phydev->asym_pause = 0;
+
+		return 0;
+	}
+
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, phydev->lp_advertising, 1);
+
+	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_LP_L);
+	if (val < 0)
+		return val;
+
+	mii_t1_adv_l_mod_linkmode_t(phydev->lp_advertising, val);
+	phydev->pause = val & MDIO_AN_T1_ADV_L_PAUSE_CAP ? 1 : 0;
+	phydev->asym_pause = val & MDIO_AN_T1_ADV_L_PAUSE_ASYM ? 1 : 0;
+
+	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_LP_M);
+	if (val < 0)
+		return val;
+
+	mii_t1_adv_m_mod_linkmode_t(phydev->lp_advertising, val);
+
+	return 0;
+}
+
 /**
  * genphy_c45_read_lpa - read the link partner advertisement and pause
  * @phydev: target phy_device struct
@@ -321,6 +493,9 @@ int genphy_c45_read_lpa(struct phy_device *phydev)
 {
 	int val;
 
+	if (genphy_c45_baset1_able(phydev))
+		return genphy_c45_baset1_read_lpa(phydev);
+
 	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1);
 	if (val < 0)
 		return val;
@@ -399,6 +574,17 @@ int genphy_c45_read_pma(struct phy_device *phydev)
 
 	phydev->duplex = DUPLEX_FULL;
 
+	if (genphy_c45_baset1_able(phydev)) {
+		val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_PMD_BT1_CTRL);
+		if (val < 0)
+			return val;
+
+		if (val & MDIO_PMA_PMD_BT1_CTRL_CFG_MST)
+			phydev->master_slave_state = MASTER_SLAVE_STATE_MASTER;
+		else
+			phydev->master_slave_state = MASTER_SLAVE_STATE_SLAVE;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(genphy_c45_read_pma);
@@ -530,12 +716,67 @@ int genphy_c45_pma_read_abilities(struct phy_device *phydev)
 				 phydev->supported,
 				 val & MDIO_PMA_NG_EXTABLE_5GBT);
 		}
+
+		if (val & MDIO_PMA_EXTABLE_BT1) {
+			val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_PMD_BT1);
+			if (val < 0)
+				return val;
+
+			linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT,
+					 phydev->supported,
+					 val & MDIO_PMA_PMD_BT1_B10L_ABLE);
+
+			val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_STAT);
+			if (val < 0)
+				return val;
+
+			linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					 phydev->supported,
+					 val & MDIO_AN_STAT1_ABLE);
+		}
 	}
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(genphy_c45_pma_read_abilities);
 
+/* Read master/slave preference from registers.
+ * The preference is read from the BIT(4) of BASE-T1 AN
+ * advertisement register 7.515 and whether the preference
+ * is forced or not, it is read from BASE-T1 AN advertisement
+ * register 7.514.
+ */ +static int genphy_c45_baset1_read_status(struct phy_device *phydev) +{ + int ret; + int cfg; + + phydev->master_slave_get = MASTER_SLAVE_CFG_UNKNOWN; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; + + ret = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_ADV_L); + if (ret < 0) + return ret; + + cfg = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_T1_ADV_M); + if (cfg < 0) + return cfg; + + if (ret & MDIO_AN_T1_ADV_L_FORCE_MS) { + if (cfg & MDIO_AN_T1_ADV_M_MST) + phydev->master_slave_get = MASTER_SLAVE_CFG_MASTER_FORCE; + else + phydev->master_slave_get = MASTER_SLAVE_CFG_SLAVE_FORCE; + } else { + if (cfg & MDIO_AN_T1_ADV_M_MST) + phydev->master_slave_get = MASTER_SLAVE_CFG_MASTER_PREFERRED; + else + phydev->master_slave_get = MASTER_SLAVE_CFG_SLAVE_PREFERRED; + } + + return 0; +} + /** * genphy_c45_read_status - read PHY status * @phydev: target phy_device struct @@ -560,6 +801,12 @@ int genphy_c45_read_status(struct phy_device *phydev) if (ret) return ret; + if (genphy_c45_baset1_able(phydev)) { + ret = genphy_c45_baset1_read_status(phydev); + if (ret < 0) + return ret; + } + phy_resolve_aneg_linkmode(phydev); } else { ret = genphy_c45_read_pma(phydev); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1369daeded14..431a8719c635 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -600,6 +600,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, dev->autoneg = AUTONEG_ENABLE; + dev->pma_extable = -ENODATA; dev->is_c45 = is_c45; dev->phy_id = phy_id; if (c45_ids) diff --git a/include/linux/mdio.h b/include/linux/mdio.h index ecac96d52e01..00177567cfef 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -340,6 +340,76 @@ static inline void mii_10gbt_stat_mod_linkmode_lpa_t(unsigned long *advertising, advertising, lpa & MDIO_AN_10GBT_STAT_LP10G); } +/** + * mii_t1_adv_l_mod_linkmode_t + * @advertising: target the linkmode advertisement settings + * @lpa: value of the BASE-T1 Autonegotiation Advertisement [15:0] Register + * + * A small helper function that translates BASE-T1 Autonegotiation + * Advertisement [15:0] Register bits to linkmode advertisement settings. + * Other bits in advertising aren't changed. + */ +static inline void mii_t1_adv_l_mod_linkmode_t(unsigned long *advertising, u32 lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, + lpa & MDIO_AN_T1_ADV_L_PAUSE_CAP); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, + lpa & MDIO_AN_T1_ADV_L_PAUSE_ASYM); +} + +/** + * mii_t1_adv_m_mod_linkmode_t + * @advertising: target the linkmode advertisement settings + * @lpa: value of the BASE-T1 Autonegotiation Advertisement [31:16] Register + * + * A small helper function that translates BASE-T1 Autonegotiation + * Advertisement [31:16] Register bits to linkmode advertisement settings. + * Other bits in advertising aren't changed. + */ +static inline void mii_t1_adv_m_mod_linkmode_t(unsigned long *advertising, u32 lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, + advertising, lpa & MDIO_AN_T1_ADV_M_B10L); +} + +/** + * linkmode_adv_to_mii_t1_adv_l_t + * @advertising: the linkmode advertisement settings + * + * A small helper function that translates linkmode advertisement + * settings to phy autonegotiation advertisements for the + * BASE-T1 Autonegotiation Advertisement [15:0] Register. 
+ */ +static inline u32 linkmode_adv_to_mii_t1_adv_l_t(unsigned long *advertising) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) + result |= MDIO_AN_T1_ADV_L_PAUSE_CAP; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) + result |= MDIO_AN_T1_ADV_L_PAUSE_ASYM; + + return result; +} + +/** + * linkmode_adv_to_mii_t1_adv_m_t + * @advertising: the linkmode advertisement settings + * + * A small helper function that translates linkmode advertisement + * settings to phy autonegotiation advertisements for the + * BASE-T1 Autonegotiation Advertisement [31:16] Register. + */ +static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, advertising)) + result |= MDIO_AN_T1_ADV_M_B10L; + + return result; +} + int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, diff --git a/include/linux/phy.h b/include/linux/phy.h index b12af9e2f389..2d12054932ba 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -570,6 +570,7 @@ struct macsec_ops; * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover + * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics @@ -698,6 +699,8 @@ struct phy_device { u8 mdix; u8 mdix_ctrl; + int pma_extable; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index fa3515257f54..75b7257a51e1 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -70,6 +70,7 @@ #define MDIO_B10L_PMA_CTRL 2294 /* 10BASE-T1L PMA control */ #define MDIO_PMA_10T1L_STAT 2295 /* 10BASE-T1L PMA status */ #define MDIO_PCS_10T1L_CTRL 2278 /* 10BASE-T1L PCS control */ +#define MDIO_PMA_PMD_BT1 18 /* BASE-T1 PMA/PMD extended ability */ #define MDIO_AN_T1_CTRL 512 /* BASE-T1 AN control */ #define MDIO_AN_T1_STAT 513 /* BASE-T1 AN status */ #define MDIO_AN_T1_ADV_L 514 /* BASE-T1 AN advertisement register [15:0] */ @@ -78,6 +79,7 @@ #define MDIO_AN_T1_LP_L 517 /* BASE-T1 AN LP Base Page ability register [15:0] */ #define MDIO_AN_T1_LP_M 518 /* BASE-T1 AN LP Base Page ability register [31:16] */ #define MDIO_AN_T1_LP_H 519 /* BASE-T1 AN LP Base Page ability register [47:32] */ +#define MDIO_PMA_PMD_BT1_CTRL 2100 /* BASE-T1 PMA/PMD control register */ /* LASI (Link Alarm Status Interrupt) registers, defined by XENPAK MSA. 
*/ #define MDIO_PMA_LASI_RXCTRL 0x9000 /* RX_ALARM control */ @@ -170,6 +172,7 @@ #define MDIO_PMA_CTRL2_10BT 0x000f /* 10BASE-T type */ #define MDIO_PMA_CTRL2_2_5GBT 0x0030 /* 2.5GBaseT type */ #define MDIO_PMA_CTRL2_5GBT 0x0031 /* 5GBaseT type */ +#define MDIO_PMA_CTRL2_BASET1 0x003D /* BASE-T1 type */ #define MDIO_PCS_CTRL2_TYPE 0x0003 /* PCS type selection */ #define MDIO_PCS_CTRL2_10GBR 0x0000 /* 10GBASE-R type */ #define MDIO_PCS_CTRL2_10GBX 0x0001 /* 10GBASE-X type */ @@ -223,6 +226,7 @@ #define MDIO_PMA_EXTABLE_1000BKX 0x0040 /* 1000BASE-KX ability */ #define MDIO_PMA_EXTABLE_100BTX 0x0080 /* 100BASE-TX ability */ #define MDIO_PMA_EXTABLE_10BT 0x0100 /* 10BASE-T ability */ +#define MDIO_PMA_EXTABLE_BT1 0x0800 /* BASE-T1 ability */ #define MDIO_PMA_EXTABLE_NBT 0x4000 /* 2.5/5GBASE-T ability */ /* PHY XGXS lane state register. */ @@ -301,6 +305,9 @@ #define MDIO_PCS_10T1L_CTRL_LB 0x4000 /* Enable PCS level loopback mode */ #define MDIO_PCS_10T1L_CTRL_RESET 0x8000 /* PCS reset */ +/* BASE-T1 PMA/PMD extended ability register. */ +#define MDIO_PMA_PMD_BT1_B10L_ABLE 0x0004 /* 10BASE-T1L Ability */ + /* BASE-T1 auto-negotiation advertisement register [15:0] */ #define MDIO_AN_T1_ADV_L_PAUSE_CAP ADVERTISE_PAUSE_CAP #define MDIO_AN_T1_ADV_L_PAUSE_ASYM ADVERTISE_PAUSE_ASYM @@ -333,6 +340,9 @@ #define MDIO_AN_T1_LP_H_10L_TX_HI_REQ 0x1000 /* 10BASE-T1L High Level LP Transmit Request */ #define MDIO_AN_T1_LP_H_10L_TX_HI 0x2000 /* 10BASE-T1L High Level LP Transmit Ability */ +/* BASE-T1 PMA/PMD control register */ +#define MDIO_PMA_PMD_BT1_CTRL_CFG_MST 0x4000 /* MASTER-SLAVE config value */ + /* EEE Supported/Advertisement/LP Advertisement registers. * * EEE capability Register (3.20), Advertisement (7.60) and -- cgit v1.2.3 From c2aa2dfef243efe213a480a1ee8566507a5152f4 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 3 May 2022 01:09:56 -0700 Subject: seccomp: Add wait_killable semantic to seccomp user notifier This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) that makes it so that when notifications are received by the supervisor the notifying process will transition to wait killable semantics. Although wait killable isn't a set of semantics formally exposed to userspace, the concept is searchable. If the notifying process is signaled prior to the notification being received by the userspace agent, it will be handled as normal. One quirk about how this is handled is that the notifying process only switches to TASK_KILLABLE if it receives a wakeup from either an addfd or a signal. This is to avoid an unnecessary wakeup of the notifying task. The reasons behind switching into wait_killable only after userspace receives the notification are: * Avoiding unncessary work - Often, workloads will perform work that they may abort (request racing comes to mind). This allows for syscalls to be aborted safely prior to the notification being received by the supervisor. In this, the supervisor doesn't end up doing work that the workload does not want to complete anyways. * Avoiding side effects - We don't want the syscall to be interruptible once the supervisor starts doing work because it may not be trivial to reverse the operation. For example, unmounting a file system may take a long time, and it's hard to rollback, or treat that as reentrant. * Avoid breaking runtimes - Various runtimes do not GC when they are during a syscall (or while running native code that subsequently calls a syscall). 
If many notifications are blocked, and not picked up by the supervisor, this can get the application into a bad state.

Signed-off-by: Sargun Dhillon
Signed-off-by: Kees Cook
Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me
---
 Documentation/userspace-api/seccomp_filter.rst | 10 ++++++
 include/linux/seccomp.h | 3 +-
 include/uapi/linux/seccomp.h | 2 ++
 kernel/seccomp.c | 42 ++++++++++++++++++++++++--
 4 files changed, 54 insertions(+), 3 deletions(-)
(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index 539e9d4a4860..d1e2b9193f09 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -271,6 +271,16 @@ notifying process it will be replaced. The supervisor can also add an FD, and
 respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
 value will be the injected file descriptor number.
 
+The notifying process can be preempted, resulting in the notification being
+aborted. This can be problematic when trying to take actions on behalf of the
+notifying process that are long-running and typically retryable (mounting a
+filesystem). Alternatively, at filter installation time, the
+``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
+such that when a user notification is received by the supervisor, the notifying
+process will ignore non-fatal signals until the response is sent. Signals that
+are sent prior to the notification being received by userspace are handled
+normally.
+
 It is worth noting that ``struct seccomp_data`` contains the values of register
 arguments to the syscall, but does not contain pointers to memory. The task's
 memory is accessible to suitably privileged traces via ``ptrace()`` or
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 0c564e5d40ff..d31d76be4982 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -8,7 +8,8 @@
 					 SECCOMP_FILTER_FLAG_LOG | \
 					 SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
 					 SECCOMP_FILTER_FLAG_NEW_LISTENER | \
-					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
+					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
+					 SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
 
 /* sizeof() the first published struct seccomp_notif_addfd */
 #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 78074254ab98..0fdc6ef02b94 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -23,6 +23,8 @@
 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
 #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
+/* Received notifications wait in killable state (only respond to fatal signals) */
+#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
 
 /*
  * All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3caa0fe85235..e9852d1b4a5e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
 *	the filter can be freed.
 * @cache: cache of arch/syscall mappings to actions
 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ *			notification is received by the userspace listener.
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 * @notif: the struct that holds all notification related information
@@ -220,6 +222,7 @@ struct seccomp_filter {
 	refcount_t refs;
 	refcount_t users;
 	bool log;
+	bool wait_killable_recv;
 	struct action_cache cache;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
@@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
 	if (flags & SECCOMP_FILTER_FLAG_LOG)
 		filter->log = true;
 
+	/* Set wait killable flag, if present. */
+	if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+		filter->wait_killable_recv = true;
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
@@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
 	complete(&addfd->completion);
 }
 
+static bool should_sleep_killable(struct seccomp_filter *match,
+				  struct seccomp_knotif *n)
+{
+	return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
 static int seccomp_do_user_notification(int this_syscall,
 					struct seccomp_filter *match,
 					const struct seccomp_data *sd)
@@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
 	 * This is where we wait for a reply from userspace.
 	 */
 	do {
+		bool wait_killable = should_sleep_killable(match, &n);
+
 		mutex_unlock(&match->notify_lock);
-		err = wait_for_completion_interruptible(&n.ready);
+		if (wait_killable)
+			err = wait_for_completion_killable(&n.ready);
+		else
+			err = wait_for_completion_interruptible(&n.ready);
 		mutex_lock(&match->notify_lock);
-		if (err != 0)
+
+		if (err != 0) {
+			/*
+			 * Check to see if the notification got picked up and
+			 * whether we should switch to wait killable.
+			 */
+			if (!wait_killable && should_sleep_killable(match, &n))
+				continue;
+
 			goto interrupted;
+		}
 
 		addfd = list_first_entry_or_null(&n.addfd,
 						 struct seccomp_kaddfd, list);
@@ -1484,6 +1511,9 @@ out:
 	mutex_lock(&filter->notify_lock);
 	knotif = find_notification(filter, unotif.id);
 	if (knotif) {
+		/* Reset the process to make sure it's not stuck */
+		if (should_sleep_killable(filter, knotif))
+			complete(&knotif->ready);
 		knotif->state = SECCOMP_NOTIFY_INIT;
 		up(&filter->notif->request);
 	}
@@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	    ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
 		return -EINVAL;
 
+	/*
+	 * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV flag doesn't make sense
+	 * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+	 */
+	if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+	    ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+		return -EINVAL;
+
 	/* Prepare the new filter before holding any locks. */
 	prepared = seccomp_prepare_user_filter(filter);
 	if (IS_ERR(prepared))
-- 
cgit v1.2.3


From 41b3c69bf9414375319290c59f198ff5c71d273f Mon Sep 17 00:00:00 2001
From: Kishen Maloor
Date: Mon, 2 May 2022 13:52:36 -0700
Subject: mptcp: expose server_side attribute in MPTCP netlink events

This change records the 'server_side' attribute of MPTCP_EVENT_CREATED and
MPTCP_EVENT_ESTABLISHED events to inform their recipient about the
Client/Server role of the running MPTCP application.
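A sketch of how an event listener might pull the new attribute out of the genetlink payload (the helper, its calling convention, and the manual attribute walk are illustrative; a real listener would typically use libmnl/libnl parsing instead):

#include <linux/mptcp.h>
#include <linux/netlink.h>

/* Walk the attribute block of an MPTCP_EVENT_CREATED/_ESTABLISHED event
 * and return the server_side byte, or -1 if the attribute is absent. */
static int mptcp_event_server_side(const struct nlattr *na, int len)
{
	while (len >= NLA_HDRLEN && na->nla_len >= NLA_HDRLEN &&
	       (int)na->nla_len <= len) {
		if (na->nla_type == MPTCP_ATTR_SERVER_SIDE)
			return *(const __u8 *)((const char *)na + NLA_HDRLEN);
		len -= NLA_ALIGN(na->nla_len);
		na = (const struct nlattr *)((const char *)na +
					     NLA_ALIGN(na->nla_len));
	}
	return -1;
}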
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/246 Acked-by: Paolo Abeni Signed-off-by: Kishen Maloor Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 9690efedb5fa..e41ea01a94bb 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -188,6 +188,7 @@ enum mptcp_event_attr { MPTCP_ATTR_IF_IDX, /* s32 */ MPTCP_ATTR_RESET_REASON,/* u32 */ MPTCP_ATTR_RESET_FLAGS, /* u32 */ + MPTCP_ATTR_SERVER_SIDE, /* u8 */ __MPTCP_ATTR_AFTER_LAST }; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index eeaa96bcae6c..a4430c576ce9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1985,6 +1985,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } -- cgit v1.2.3 From 7b33a09d036ffd9a04506122840629c7e870cf08 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 4 May 2022 03:24:40 +0000 Subject: KVM: arm64: Add support for userspace to suspend a vCPU Introduce a new MP state, KVM_MP_STATE_SUSPENDED, which indicates a vCPU is in a suspended state. In the suspended state the vCPU will block until a wakeup event (pending interrupt) is recognized. Add a new system event type, KVM_SYSTEM_EVENT_WAKEUP, to indicate to userspace that KVM has recognized one such wakeup event. It is the responsibility of userspace to then make the vCPU runnable, or leave it suspended until the next wakeup event. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220504032446.4133305-7-oupton@google.com --- Documentation/virt/kvm/api.rst | 37 ++++++++++++++++++++++++++-- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 51 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 2 ++ 4 files changed, 89 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4a900cdbc62e..46ca84600dca 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1476,14 +1476,43 @@ Possible values are: [s390] KVM_MP_STATE_LOAD the vcpu is in a special load/startup state [s390] + KVM_MP_STATE_SUSPENDED the vcpu is in a suspend state and is waiting + for a wakeup event [arm64] ========================== =============================================== On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel irqchip, the multiprocessing state must be maintained by userspace on these architectures. -For arm64/riscv: -^^^^^^^^^^^^^^^^ +For arm64: +^^^^^^^^^^ + +If a vCPU is in the KVM_MP_STATE_SUSPENDED state, KVM will emulate the +architectural execution of a WFI instruction. + +If a wakeup event is recognized, KVM will exit to userspace with a +KVM_SYSTEM_EVENT exit, where the event type is KVM_SYSTEM_EVENT_WAKEUP. If +userspace wants to honor the wakeup, it must set the vCPU's MP state to +KVM_MP_STATE_RUNNABLE. If it does not, KVM will continue to await a wakeup +event in subsequent calls to KVM_RUN. + +.. warning:: + + If userspace intends to keep the vCPU in a SUSPENDED state, it is + strongly recommended that userspace take action to suppress the + wakeup event (such as masking an interrupt). 
Otherwise, subsequent + calls to KVM_RUN will immediately exit with a KVM_SYSTEM_EVENT_WAKEUP + event and inadvertently waste CPU cycles. + + Additionally, if userspace takes action to suppress a wakeup event, + it is strongly recommended that it also restores the vCPU to its + original state when the vCPU is made RUNNABLE again. For example, + if userspace masked a pending interrupt to suppress the wakeup, + the interrupt should be unmasked before returning control to the + guest. + +For riscv: +^^^^^^^^^^ The only states that are valid are KVM_MP_STATE_STOPPED and KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. @@ -5985,6 +6014,7 @@ should put the acknowledged interrupt vector into the 'epr' field. #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 + #define KVM_SYSTEM_EVENT_WAKEUP 4 __u32 type; __u32 ndata; __u64 data[16]; @@ -6009,6 +6039,9 @@ Valid values for 'type' are: has requested a crash condition maintenance. Userspace can choose to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + - KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and + KVM has recognized a wakeup event. Userspace may honor this event by + marking the exiting vCPU as runnable, or deny it and call KVM_RUN again. If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain architecture specific information for the system-level event. Only diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f3f93d48e21a..46027b9b80ca 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,6 +46,7 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) +#define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index efe54aba5cce..abd32a84ed7a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -444,6 +444,18 @@ bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED; } +static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED; + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + kvm_vcpu_kick(vcpu); +} + +static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { @@ -464,6 +476,9 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, case KVM_MP_STATE_STOPPED: kvm_arm_vcpu_power_off(vcpu); break; + case KVM_MP_STATE_SUSPENDED: + kvm_arm_vcpu_suspend(vcpu); + break; default: ret = -EINVAL; } @@ -648,6 +663,39 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_enable(); } +static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) +{ + if (!kvm_arm_vcpu_suspended(vcpu)) + return 1; + + kvm_vcpu_wfi(vcpu); + + /* + * The suspend state is sticky; we do not leave it until userspace + * explicitly marks the vCPU as runnable. Request that we suspend again + * later. + */ + kvm_make_request(KVM_REQ_SUSPEND, vcpu); + + /* + * Check to make sure the vCPU is actually runnable. If so, exit to + * userspace informing it of the wakeup condition. 
+ */ + if (kvm_arch_vcpu_runnable(vcpu)) { + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + return 0; + } + + /* + * Otherwise, we were unblocked to process a different event, such as a + * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to + * process the event. + */ + return 1; +} + /** * check_vcpu_requests - check and handle pending vCPU requests * @vcpu: the VCPU pointer @@ -686,6 +734,9 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) kvm_pmu_handle_pmcr(vcpu, __vcpu_sys_reg(vcpu, PMCR_EL0)); + + if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) + return kvm_vcpu_suspend(vcpu); } return 1; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6a184d260c7f..7f72fb7b05f2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -444,6 +444,7 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 +#define KVM_SYSTEM_EVENT_WAKEUP 4 __u32 type; __u32 ndata; union { @@ -646,6 +647,7 @@ struct kvm_vapic_addr { #define KVM_MP_STATE_OPERATING 7 #define KVM_MP_STATE_LOAD 8 #define KVM_MP_STATE_AP_RESET_HOLD 9 +#define KVM_MP_STATE_SUSPENDED 10 struct kvm_mp_state { __u32 mp_state; -- cgit v1.2.3 From bfbab44568779e1682bc6f63688bb9c965f0e74a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 4 May 2022 03:24:41 +0000 Subject: KVM: arm64: Implement PSCI SYSTEM_SUSPEND ARM DEN0022D.b 5.19 "SYSTEM_SUSPEND" describes a PSCI call that allows software to request that a system be placed in the deepest possible low-power state. Effectively, software can use this to suspend itself to RAM. Unfortunately, there really is no good way to implement a system-wide PSCI call in KVM. Any precondition checks done in the kernel will need to be repeated by userspace since there is no good way to protect a critical section that spans an exit to userspace. SYSTEM_RESET and SYSTEM_OFF are equally plagued by this issue, although no users have seemingly cared for the relatively long time these calls have been supported. The solution is to just make the whole implementation userspace's problem. Introduce a new system event, KVM_SYSTEM_EVENT_SUSPEND, that indicates to userspace a calling vCPU has invoked PSCI SYSTEM_SUSPEND. Additionally, add a CAP to get buy-in from userspace for this new exit type. Only advertise the SYSTEM_SUSPEND PSCI call if userspace has opted in. If a vCPU calls SYSTEM_SUSPEND, punt straight to userspace. Provide explicit documentation of userspace's responsibilites for the exit and point to the PSCI specification to describe the actual PSCI call. Reviewed-by: Reiji Watanabe Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220504032446.4133305-8-oupton@google.com --- Documentation/virt/kvm/api.rst | 39 +++++++++++++++++++++++++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/arm.c | 5 +++++ arch/arm64/kvm/psci.c | 29 +++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 2 ++ 5 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 46ca84600dca..d8d7859fc556 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6015,6 +6015,7 @@ should put the acknowledged interrupt vector into the 'epr' field. 
#define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_WAKEUP 4 + #define KVM_SYSTEM_EVENT_SUSPEND 5 __u32 type; __u32 ndata; __u64 data[16]; @@ -6042,6 +6043,34 @@ Valid values for 'type' are: - KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and KVM has recognized a wakeup event. Userspace may honor this event by marking the exiting vCPU as runnable, or deny it and call KVM_RUN again. + - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested a suspension of + the VM. + +For arm/arm64: +^^^^^^^^^^^^^^ + + KVM_SYSTEM_EVENT_SUSPEND exits are enabled with the + KVM_CAP_ARM_SYSTEM_SUSPEND VM capability. If a guest invokes the PSCI + SYSTEM_SUSPEND function, KVM will exit to userspace with this event + type. + + It is the sole responsibility of userspace to implement the PSCI + SYSTEM_SUSPEND call according to ARM DEN0022D.b 5.19 "SYSTEM_SUSPEND". + KVM does not change the vCPU's state before exiting to userspace, so + the call parameters are left in-place in the vCPU registers. + + Userspace is _required_ to take action for such an exit. It must + either: + + - Honor the guest request to suspend the VM. Userspace can request + in-kernel emulation of suspension by setting the calling vCPU's + state to KVM_MP_STATE_SUSPENDED. Userspace must configure the vCPU's + state according to the parameters passed to the PSCI function when + the calling vCPU is resumed. See ARM DEN0022D.b 5.19.1 "Intended use" + for details on the function parameters. + + - Deny the guest request to suspend the VM. See ARM DEN0022D.b 5.19.2 + "Caller responsibilities" for possible return values. If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain architecture specific information for the system-level event. Only @@ -7767,6 +7796,16 @@ At this time, KVM_PMU_CAP_DISABLE is the only capability. Setting this capability will disable PMU virtualization for that VM. Usermode should adjust CPUID leaf 0xA to reflect that the PMU is disabled. +8.36 KVM_CAP_ARM_SYSTEM_SUSPEND +------------------------------- + +:Capability: KVM_CAP_ARM_SYSTEM_SUSPEND +:Architectures: arm64 +:Type: vm + +When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of +type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. + 9. 
Known KVM API problems ========================= diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 46027b9b80ca..d9df81949f76 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -137,6 +137,8 @@ struct kvm_arch { */ #define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3 #define KVM_ARCH_FLAG_EL1_32BIT 4 + /* PSCI SYSTEM_SUSPEND enabled for the guest */ +#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 unsigned long flags; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index abd32a84ed7a..f8a89ae52710 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -97,6 +97,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_ARM_SYSTEM_SUSPEND: + r = 0; + set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); + break; default: r = -EINVAL; break; @@ -210,6 +214,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: + case KVM_CAP_ARM_SYSTEM_SUSPEND: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 2e6f060214a7..5de30e72ad40 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -195,6 +195,15 @@ static void kvm_psci_system_reset2(struct kvm_vcpu *vcpu) KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2); } +static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + memset(&run->system_event, 0, sizeof(vcpu->run->system_event)); + run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND; + run->exit_reason = KVM_EXIT_SYSTEM_EVENT; +} + static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) { int i; @@ -300,6 +309,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) { unsigned long val = PSCI_RET_NOT_SUPPORTED; u32 psci_fn = smccc_get_function(vcpu); + struct kvm *kvm = vcpu->kvm; u32 arg; int ret = 1; @@ -331,6 +341,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) case ARM_SMCCC_VERSION_FUNC_ID: val = 0; break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) + val = 0; + break; case PSCI_1_1_FN_SYSTEM_RESET2: case PSCI_1_1_FN64_SYSTEM_RESET2: if (minor >= 1) @@ -338,6 +353,20 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) break; } break; + case PSCI_1_0_FN_SYSTEM_SUSPEND: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_0_FN64_SYSTEM_SUSPEND: + /* + * Return directly to userspace without changing the vCPU's + * registers. Userspace depends on reading the SMCCC parameters + * to implement SYSTEM_SUSPEND. 
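+	 * Per ARM DEN0022D.b 5.19, x1 holds the entry point address and x2 the context ID for the eventual resume (w1/w2 for the 32-bit call); userspace can read them with KVM_GET_ONE_REG when it chooses to honor the request.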
+ */ + if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) { + kvm_psci_system_suspend(vcpu); + return 0; + } + break; case PSCI_1_1_FN_SYSTEM_RESET2: kvm_psci_narrow_to_32bit(vcpu); fallthrough; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7f72fb7b05f2..32c56384fd08 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -445,6 +445,7 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_WAKEUP 4 +#define KVM_SYSTEM_EVENT_SUSPEND 5 __u32 type; __u32 ndata; union { @@ -1154,6 +1155,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DISABLE_QUIRKS2 213 /* #define KVM_CAP_VM_TSC_CONTROL 214 */ #define KVM_CAP_SYSTEM_EVENT_DATA 215 +#define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 2068339a6c35147847ba433fd0da67b313779059 Mon Sep 17 00:00:00 2001 From: Dipen Patel Date: Fri, 22 Apr 2022 13:52:18 -0700 Subject: gpiolib: cdev: Add hardware timestamp clock type This patch adds a new clock type for the GPIO controller which can timestamp gpio lines using hardware means. To expose this functionality to userspace, code has been added so that the line create and set config API calls check for the new clock type and, if it is requested, call the HTE API. During a line change event, the HTE subsystem pushes timestamp data to userspace through gpiolib-cdev. Signed-off-by: Dipen Patel Acked-by: Linus Walleij Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Thierry Reding --- drivers/gpio/gpiolib-cdev.c | 252 ++++++++++++++++++++++++++++++++++++++------ include/uapi/linux/gpio.h | 3 + 2 files changed, 222 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index ffa0256cad5a..d8dba8c9d2cf 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "gpiolib.h" @@ -464,6 +465,25 @@ struct line { * stale value. */ unsigned int level; + /* + * -- hte specific fields -- + */ + struct hte_ts_desc hdesc; + /* + * The HTE provider sets the line level at the time of the event. The + * valid values are 0 and 1; a negative value indicates an error. + */ + int raw_level; + /* + * when sw_debounce is set on an HTE enabled line, this is a running + * counter of the discarded events. + */ + u32 total_discard_seq; + /* + * when sw_debounce is set on an HTE enabled line, this variable records + * the last sequence number seen before the debounce period expires.
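+	 * Together with total_discard_seq this lets debounce_work_func() collapse a burst of hardware events into a single reported event while keeping line_seqno and the request-level seqno consistent.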
+ */ + u32 last_seqno; }; /** @@ -518,6 +538,7 @@ struct linereq { GPIO_V2_LINE_DRIVE_FLAGS | \ GPIO_V2_LINE_EDGE_FLAGS | \ GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME | \ + GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE | \ GPIO_V2_LINE_BIAS_FLAGS) static void linereq_put_event(struct linereq *lr, @@ -542,10 +563,98 @@ static u64 line_event_timestamp(struct line *line) { if (test_bit(FLAG_EVENT_CLOCK_REALTIME, &line->desc->flags)) return ktime_get_real_ns(); + else if (test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) + return line->timestamp_ns; return ktime_get_ns(); } +static enum hte_return process_hw_ts_thread(void *p) +{ + struct line *line; + struct linereq *lr; + struct gpio_v2_line_event le; + int level; + u64 eflags; + + if (!p) + return HTE_CB_HANDLED; + + line = p; + lr = line->req; + + memset(&le, 0, sizeof(le)); + + le.timestamp_ns = line->timestamp_ns; + eflags = READ_ONCE(line->eflags); + + if (eflags == GPIO_V2_LINE_FLAG_EDGE_BOTH) { + if (line->raw_level >= 0) { + if (test_bit(FLAG_ACTIVE_LOW, &line->desc->flags)) + level = !line->raw_level; + else + level = line->raw_level; + } else { + level = gpiod_get_value_cansleep(line->desc); + } + + if (level) + le.id = GPIO_V2_LINE_EVENT_RISING_EDGE; + else + le.id = GPIO_V2_LINE_EVENT_FALLING_EDGE; + } else if (eflags == GPIO_V2_LINE_FLAG_EDGE_RISING) { + /* Emit low-to-high event */ + le.id = GPIO_V2_LINE_EVENT_RISING_EDGE; + } else if (eflags == GPIO_V2_LINE_FLAG_EDGE_FALLING) { + /* Emit high-to-low event */ + le.id = GPIO_V2_LINE_EVENT_FALLING_EDGE; + } else { + return HTE_CB_HANDLED; + } + le.line_seqno = line->line_seqno; + le.seqno = (lr->num_lines == 1) ? le.line_seqno : line->req_seqno; + le.offset = gpio_chip_hwgpio(line->desc); + + linereq_put_event(lr, &le); + + return HTE_CB_HANDLED; +} + +static enum hte_return process_hw_ts(struct hte_ts_data *ts, void *p) +{ + struct line *line; + struct linereq *lr; + int diff_seqno = 0; + + if (!ts || !p) + return HTE_CB_HANDLED; + + line = p; + line->timestamp_ns = ts->tsc; + line->raw_level = ts->raw_level; + lr = line->req; + + if (READ_ONCE(line->sw_debounced)) { + line->total_discard_seq++; + line->last_seqno = ts->seq; + mod_delayed_work(system_wq, &line->work, + usecs_to_jiffies(READ_ONCE(line->desc->debounce_period_us))); + } else { + if (unlikely(ts->seq < line->line_seqno)) + return HTE_CB_HANDLED; + + diff_seqno = ts->seq - line->line_seqno; + line->line_seqno = ts->seq; + if (lr->num_lines != 1) + line->req_seqno = atomic_add_return(diff_seqno, + &lr->seqno); + + return HTE_RUN_SECOND_CB; + } + + return HTE_CB_HANDLED; +} + static irqreturn_t edge_irq_thread(int irq, void *p) { struct line *line = p; @@ -651,10 +760,16 @@ static void debounce_work_func(struct work_struct *work) struct gpio_v2_line_event le; struct line *line = container_of(work, struct line, work.work); struct linereq *lr; - int level; + int level, diff_seqno; u64 eflags; - level = gpiod_get_raw_value_cansleep(line->desc); + if (test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) { + level = line->raw_level; + if (level < 0) + level = gpiod_get_raw_value_cansleep(line->desc); + } else { + level = gpiod_get_raw_value_cansleep(line->desc); + } if (level < 0) { pr_debug_ratelimited("debouncer failed to read line value\n"); return; @@ -685,10 +800,21 @@ static void debounce_work_func(struct work_struct *work) lr = line->req; le.timestamp_ns = line_event_timestamp(line); le.offset = gpio_chip_hwgpio(line->desc); - line->line_seqno++; - le.line_seqno = line->line_seqno; - le.seqno = (lr->num_lines == 1) ? 
- le.line_seqno : atomic_inc_return(&lr->seqno); + if (test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) { + /* discard events except the last one */ + line->total_discard_seq -= 1; + diff_seqno = line->last_seqno - line->total_discard_seq - + line->line_seqno; + line->line_seqno = line->last_seqno - line->total_discard_seq; + le.line_seqno = line->line_seqno; + le.seqno = (lr->num_lines == 1) ? + le.line_seqno : atomic_add_return(diff_seqno, &lr->seqno); + } else { + line->line_seqno++; + le.line_seqno = line->line_seqno; + le.seqno = (lr->num_lines == 1) ? + le.line_seqno : atomic_inc_return(&lr->seqno); + } if (level) /* Emit low-to-high event */ @@ -700,8 +826,34 @@ static void debounce_work_func(struct work_struct *work) linereq_put_event(lr, &le); } +static int hte_edge_setup(struct line *line, u64 eflags) +{ + int ret; + unsigned long flags = 0; + struct hte_ts_desc *hdesc = &line->hdesc; + + if (eflags & GPIO_V2_LINE_FLAG_EDGE_RISING) + flags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + HTE_FALLING_EDGE_TS : HTE_RISING_EDGE_TS; + if (eflags & GPIO_V2_LINE_FLAG_EDGE_FALLING) + flags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + HTE_RISING_EDGE_TS : HTE_FALLING_EDGE_TS; + + line->total_discard_seq = 0; + + hte_init_line_attr(hdesc, desc_to_gpio(line->desc), flags, + NULL, line->desc); + + ret = hte_ts_get(NULL, hdesc, 0); + if (ret) + return ret; + + return hte_request_ts_ns(hdesc, process_hw_ts, + process_hw_ts_thread, line); +} + static int debounce_setup(struct line *line, - unsigned int debounce_period_us) + unsigned int debounce_period_us, bool hte_req) { unsigned long irqflags; int ret, level, irq; @@ -721,19 +873,27 @@ static int debounce_setup(struct line *line, if (level < 0) return level; - irq = gpiod_to_irq(line->desc); - if (irq < 0) - return -ENXIO; + if (!hte_req) { + irq = gpiod_to_irq(line->desc); + if (irq < 0) + return -ENXIO; - WRITE_ONCE(line->level, level); - irqflags = IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING; - ret = request_irq(irq, debounce_irq_handler, irqflags, - line->req->label, line); - if (ret) - return ret; + irqflags = IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING; + ret = request_irq(irq, debounce_irq_handler, irqflags, + line->req->label, line); + if (ret) + return ret; + line->irq = irq; + } else { + ret = hte_edge_setup(line, + GPIO_V2_LINE_FLAG_EDGE_RISING | + GPIO_V2_LINE_FLAG_EDGE_FALLING); + if (ret) + return ret; + } + WRITE_ONCE(line->level, level); WRITE_ONCE(line->sw_debounced, 1); - line->irq = irq; } return 0; } @@ -766,13 +926,16 @@ static u32 gpio_v2_line_config_debounce_period(struct gpio_v2_line_config *lc, return 0; } -static void edge_detector_stop(struct line *line) +static void edge_detector_stop(struct line *line, bool hte_en) { - if (line->irq) { + if (line->irq && !hte_en) { free_irq(line->irq, line); line->irq = 0; } + if (hte_en) + hte_ts_put(&line->hdesc); + cancel_delayed_work_sync(&line->work); WRITE_ONCE(line->sw_debounced, 0); WRITE_ONCE(line->eflags, 0); @@ -784,7 +947,7 @@ static void edge_detector_stop(struct line *line) static int edge_detector_setup(struct line *line, struct gpio_v2_line_config *lc, unsigned int line_idx, - u64 eflags) + u64 eflags, bool hte_req) { u32 debounce_period_us; unsigned long irqflags = 0; @@ -799,7 +962,7 @@ static int edge_detector_setup(struct line *line, WRITE_ONCE(line->eflags, eflags); if (gpio_v2_line_config_debounced(lc, line_idx)) { debounce_period_us = gpio_v2_line_config_debounce_period(lc, line_idx); - ret = debounce_setup(line, debounce_period_us); + ret = 
debounce_setup(line, debounce_period_us, hte_req); if (ret) return ret; WRITE_ONCE(line->desc->debounce_period_us, debounce_period_us); @@ -809,6 +972,9 @@ static int edge_detector_setup(struct line *line, if (!eflags || READ_ONCE(line->sw_debounced)) return 0; + if (hte_req) + return hte_edge_setup(line, eflags); + irq = gpiod_to_irq(line->desc); if (irq < 0) return -ENXIO; @@ -834,13 +1000,18 @@ static int edge_detector_setup(struct line *line, static int edge_detector_update(struct line *line, struct gpio_v2_line_config *lc, unsigned int line_idx, - u64 eflags, bool polarity_change) + u64 flags, bool polarity_change, + bool prev_hte_flag) { + u64 eflags = flags & GPIO_V2_LINE_EDGE_FLAGS; unsigned int debounce_period_us = - gpio_v2_line_config_debounce_period(lc, line_idx); + gpio_v2_line_config_debounce_period(lc, line_idx); + bool hte_change = (prev_hte_flag != + ((flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE) != 0)); if ((READ_ONCE(line->eflags) == eflags) && !polarity_change && - (READ_ONCE(line->desc->debounce_period_us) == debounce_period_us)) + (READ_ONCE(line->desc->debounce_period_us) == debounce_period_us) + && !hte_change) return 0; /* sw debounced and still will be...*/ @@ -851,11 +1022,12 @@ static int edge_detector_update(struct line *line, } /* reconfiguring edge detection or sw debounce being disabled */ - if ((line->irq && !READ_ONCE(line->sw_debounced)) || + if ((line->irq && !READ_ONCE(line->sw_debounced)) || prev_hte_flag || (!debounce_period_us && READ_ONCE(line->sw_debounced))) - edge_detector_stop(line); + edge_detector_stop(line, prev_hte_flag); - return edge_detector_setup(line, lc, line_idx, eflags); + return edge_detector_setup(line, lc, line_idx, eflags, + flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE); } static u64 gpio_v2_line_config_flags(struct gpio_v2_line_config *lc, @@ -891,7 +1063,6 @@ static int gpio_v2_line_flags_validate(u64 flags) /* Return an error if an unknown flag is set */ if (flags & ~GPIO_V2_LINE_VALID_FLAGS) return -EINVAL; - /* * Do not allow both INPUT and OUTPUT flags to be set as they are * contradictory. @@ -900,6 +1071,11 @@ static int gpio_v2_line_flags_validate(u64 flags) (flags & GPIO_V2_LINE_FLAG_OUTPUT)) return -EINVAL; + /* Only allow one event clock source */ + if ((flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME) && + (flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE)) + return -EINVAL; + /* Edge detection requires explicit input. 
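	 * (For example, GPIO_V2_LINE_FLAG_EDGE_RISING without GPIO_V2_LINE_FLAG_INPUT is rejected with -EINVAL by the check below.)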
*/ if ((flags & GPIO_V2_LINE_EDGE_FLAGS) && !(flags & GPIO_V2_LINE_FLAG_INPUT)) @@ -992,6 +1168,8 @@ static void gpio_v2_line_config_flags_to_desc_flags(u64 flags, assign_bit(FLAG_EVENT_CLOCK_REALTIME, flagsp, flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME); + assign_bit(FLAG_EVENT_CLOCK_HTE, flagsp, + flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE); } static long linereq_get_values(struct linereq *lr, void __user *ip) @@ -1121,6 +1299,7 @@ static long linereq_set_config_unlocked(struct linereq *lr, unsigned int i; u64 flags; bool polarity_change; + bool prev_hte_flag; int ret; for (i = 0; i < lr->num_lines; i++) { @@ -1130,6 +1309,8 @@ static long linereq_set_config_unlocked(struct linereq *lr, (!!test_bit(FLAG_ACTIVE_LOW, &desc->flags) != ((flags & GPIO_V2_LINE_FLAG_ACTIVE_LOW) != 0)); + prev_hte_flag = !!test_bit(FLAG_EVENT_CLOCK_HTE, &desc->flags); + gpio_v2_line_config_flags_to_desc_flags(flags, &desc->flags); /* * Lines have to be requested explicitly for input @@ -1138,7 +1319,7 @@ static long linereq_set_config_unlocked(struct linereq *lr, if (flags & GPIO_V2_LINE_FLAG_OUTPUT) { int val = gpio_v2_line_config_output_value(lc, i); - edge_detector_stop(&lr->lines[i]); + edge_detector_stop(&lr->lines[i], prev_hte_flag); ret = gpiod_direction_output(desc, val); if (ret) return ret; @@ -1148,8 +1329,7 @@ static long linereq_set_config_unlocked(struct linereq *lr, return ret; ret = edge_detector_update(&lr->lines[i], lc, i, - flags & GPIO_V2_LINE_EDGE_FLAGS, - polarity_change); + flags, polarity_change, prev_hte_flag); if (ret) return ret; } @@ -1278,9 +1458,12 @@ static ssize_t linereq_read(struct file *file, static void linereq_free(struct linereq *lr) { unsigned int i; + bool hte; for (i = 0; i < lr->num_lines; i++) { - edge_detector_stop(&lr->lines[i]); + hte = !!test_bit(FLAG_EVENT_CLOCK_HTE, + &lr->lines[i].desc->flags); + edge_detector_stop(&lr->lines[i], hte); if (lr->lines[i].desc) gpiod_free(lr->lines[i].desc); } @@ -1406,7 +1589,8 @@ static int linereq_create(struct gpio_device *gdev, void __user *ip) goto out_free_linereq; ret = edge_detector_setup(&lr->lines[i], lc, i, - flags & GPIO_V2_LINE_EDGE_FLAGS); + flags & GPIO_V2_LINE_EDGE_FLAGS, + flags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE); if (ret) goto out_free_linereq; } @@ -1959,6 +2143,8 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, if (test_bit(FLAG_EVENT_CLOCK_REALTIME, &desc->flags)) info->flags |= GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME; + else if (test_bit(FLAG_EVENT_CLOCK_HTE, &desc->flags)) + info->flags |= GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE; debounce_period_us = READ_ONCE(desc->debounce_period_us); if (debounce_period_us) { diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index eaaea3d8e6b4..cb9966d49a16 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -66,6 +66,8 @@ struct gpiochip_info { * @GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN: line has pull-down bias enabled * @GPIO_V2_LINE_FLAG_BIAS_DISABLED: line has bias disabled * @GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME: line events contain REALTIME timestamps + * @GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE: line events contain timestamps from + * hardware timestamp engine */ enum gpio_v2_line_flag { GPIO_V2_LINE_FLAG_USED = _BITULL(0), @@ -80,6 +82,7 @@ enum gpio_v2_line_flag { GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN = _BITULL(9), GPIO_V2_LINE_FLAG_BIAS_DISABLED = _BITULL(10), GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME = _BITULL(11), + GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE = _BITULL(12), }; /** -- cgit v1.2.3 From 9ab4807c84a4aacfc9b4f79cc81254035e0ec361 
Mon Sep 17 00:00:00 2001 From: Kishen Maloor Date: Tue, 3 May 2022 19:38:52 -0700 Subject: mptcp: netlink: Add MPTCP_PM_CMD_ANNOUNCE This change adds an MPTCP netlink interface for issuing ADD_ADDR advertisements over the chosen MPTCP connection from a userspace path manager. The command requires the following parameters: { token, { loc_id, family, daddr4 | daddr6 [, dport] } [, if_idx], flags[signal] }. Acked-by: Paolo Abeni Signed-off-by: Kishen Maloor Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 2 ++ net/mptcp/pm_netlink.c | 16 ++++++++---- net/mptcp/pm_userspace.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 7 ++++++ 4 files changed, 81 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index e41ea01a94bb..ac66c1263f02 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -55,6 +55,7 @@ enum { MPTCP_PM_ATTR_ADDR, /* nested address */ MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */ MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ + MPTCP_PM_ATTR_TOKEN, /* u32 */ __MPTCP_PM_ATTR_MAX }; @@ -93,6 +94,7 @@ enum { MPTCP_PM_CMD_SET_LIMITS, MPTCP_PM_CMD_GET_LIMITS, MPTCP_PM_CMD_SET_FLAGS, + MPTCP_PM_CMD_ANNOUNCE, __MPTCP_PM_CMD_AFTER_LAST }; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7d9bed536966..dbe5ccd95ac5 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -352,8 +352,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, return entry; } -static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry) +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -1094,6 +1094,7 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { NLA_POLICY_NESTED(mptcp_pm_addr_policy), [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -1203,9 +1204,9 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], return err; } -static int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, - bool require_family, - struct mptcp_pm_addr_entry *entry) +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err; @@ -2198,6 +2199,11 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .doit = mptcp_nl_cmd_set_flags, .flags = GENL_ADMIN_PERM, }, + { + .cmd = MPTCP_PM_CMD_ANNOUNCE, + .doit = mptcp_nl_cmd_announce, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 910116b0f5b9..347184a9157b 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -119,3 +119,64 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); } + +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_pm_addr_entry addr_val; + struct mptcp_sock *msk; + int err = -EINVAL; + u32
token_val; + + if (!addr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto announce_err; + } + + err = mptcp_pm_parse_entry(addr, info, true, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "error parsing local address"); + goto announce_err; + } + + if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + goto announce_err; + } + + err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "did not match address and id"); + goto announce_err; + } + + lock_sock((struct sock *)msk); + spin_lock_bh(&msk->pm.lock); + + if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + mptcp_pm_announce_addr(msk, &addr_val.addr, false); + mptcp_pm_nl_addr_send_ack(msk); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock((struct sock *)msk); + + err = 0; + announce_err: + sock_put((struct sock *)msk); + return err; +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7257dc7aed43..de645efbc806 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -11,6 +11,7 @@ #include #include #include +#include #define MPTCP_SUPPORTED_VERSION 1 @@ -755,6 +756,9 @@ u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); @@ -775,6 +779,8 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * @@ -798,6 +804,7 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list * int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); void mptcp_free_local_addr_list(struct mptcp_sock *msk); +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); -- cgit v1.2.3 From d9a4594edabf125dc17dfd52acc722c3de1cb44c Mon Sep 17 00:00:00 2001 From: Kishen Maloor Date: Tue, 3 May 2022 19:38:54 -0700 Subject: mptcp: netlink: Add MPTCP_PM_CMD_REMOVE This change adds an MPTCP netlink command for issuing a REMOVE_ADDR signal for an address over the chosen MPTCP connection from a userspace path manager. The command requires the following parameters: {token, loc_id}.
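A minimal userspace sketch (an illustration only: it assumes the libnl-3 generic netlink helpers, takes MPTCP_PM_NAME, MPTCP_PM_VER and the attribute/command constants from the uapi header, and elides all error handling):

	#include <stdint.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/mptcp.h>

	/* Ask the kernel to signal REMOVE_ADDR for local id 'loc_id' on the
	 * connection identified by 'token'. The op is registered with
	 * GENL_ADMIN_PERM, so this needs CAP_NET_ADMIN. */
	static int mptcp_pm_remove(uint32_t token, uint8_t loc_id)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg = nlmsg_alloc();
		int fam;

		genl_connect(sk);
		fam = genl_ctrl_resolve(sk, MPTCP_PM_NAME);
		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0, 0,
			    MPTCP_PM_CMD_REMOVE, MPTCP_PM_VER);
		nla_put_u32(msg, MPTCP_PM_ATTR_TOKEN, token);
		nla_put_u8(msg, MPTCP_PM_ATTR_LOC_ID, loc_id);
		nl_send_auto(sk, msg);

		nlmsg_free(msg);
		nl_socket_free(sk);
		return 0;
	}

The token itself is obtained out of band, e.g. from the MPTCP netlink events that report it when a connection is created or established.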
Acked-by: Paolo Abeni Signed-off-by: Kishen Maloor Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 2 ++ net/mptcp/pm_netlink.c | 10 ++++++-- net/mptcp/pm_userspace.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 4 +++ 4 files changed, 76 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index ac66c1263f02..11f9fa001a3c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -56,6 +56,7 @@ enum { MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */ MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ MPTCP_PM_ATTR_TOKEN, /* u32 */ + MPTCP_PM_ATTR_LOC_ID, /* u8 */ __MPTCP_PM_ATTR_MAX }; @@ -95,6 +96,7 @@ enum { MPTCP_PM_CMD_GET_LIMITS, MPTCP_PM_CMD_SET_FLAGS, MPTCP_PM_CMD_ANNOUNCE, + MPTCP_PM_CMD_REMOVE, __MPTCP_PM_CMD_AFTER_LAST }; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index dbe5ccd95ac5..a26750f19f65 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1095,6 +1095,7 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -1504,8 +1505,8 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; @@ -2204,6 +2205,11 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .doit = mptcp_nl_cmd_announce, .flags = GENL_ADMIN_PERM, }, + { + .cmd = MPTCP_PM_CMD_REMOVE, + .doit = mptcp_nl_cmd_remove, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 347184a9157b..3a42c9e66126 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -180,3 +180,65 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) sock_put((struct sock *)msk); return err; } + +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; + struct mptcp_pm_addr_entry *match = NULL; + struct mptcp_pm_addr_entry *entry; + struct mptcp_sock *msk; + LIST_HEAD(free_list); + int err = -EINVAL; + u32 token_val; + u8 id_val; + + if (!id || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + id_val = nla_get_u8(id); + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto remove_err; + } + + lock_sock((struct sock *)msk); + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (entry->addr.id == id_val) { + match = entry; + break; + } + } + + if (!match) { + GENL_SET_ERR_MSG(info, "address with specified id not found"); + release_sock((struct sock 
*)msk); + goto remove_err; + } + + list_move(&match->list, &free_list); + + mptcp_pm_remove_addrs_and_subflows(msk, &free_list); + + release_sock((struct sock *)msk); + + list_for_each_entry_safe(match, entry, &free_list, list) { + sock_kfree_s((struct sock *)msk, match, sizeof(*match)); + } + + err = 0; + remove_err: + sock_put((struct sock *)msk); + return err; +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index de645efbc806..4026aa3df7f4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -801,10 +801,14 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list); + int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); void mptcp_free_local_addr_list(struct mptcp_sock *msk); int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); -- cgit v1.2.3 From 702c2f646d42cfd9e31133d68a8283fea48fd810 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 May 2022 19:38:56 -0700 Subject: mptcp: netlink: allow userspace-driven subflow establishment This allows userspace to tell the kernel to add a new subflow to an existing mptcp connection. Userspace provides the token to identify the mptcp-level connection that needs a change in active subflows and the local and remote addresses of the new or the to-be-removed subflow. MPTCP_PM_CMD_SUBFLOW_CREATE requires the following parameters: { token, { loc_id, family, loc_addr4 | loc_addr6 }, { family, rem_addr4 | rem_addr6, rem_port } }. MPTCP_PM_CMD_SUBFLOW_DESTROY requires the following parameters: { token, { family, loc_addr4 | loc_addr6, loc_port }, { family, rem_addr4 | rem_addr6, rem_port } }. Acked-by: Paolo Abeni Co-developed-by: Kishen Maloor Signed-off-by: Kishen Maloor Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- include/uapi/linux/mptcp.h | 3 + net/mptcp/pm_netlink.c | 22 ++++++ net/mptcp/pm_userspace.c | 185 +++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 4 + 4 files changed, 214 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 11f9fa001a3c..921963589904 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -57,6 +57,7 @@ enum { MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ MPTCP_PM_ATTR_TOKEN, /* u32 */ MPTCP_PM_ATTR_LOC_ID, /* u8 */ + MPTCP_PM_ATTR_ADDR_REMOTE, /* nested address */ __MPTCP_PM_ATTR_MAX }; @@ -97,6 +98,8 @@ enum { MPTCP_PM_CMD_SET_FLAGS, MPTCP_PM_CMD_ANNOUNCE, MPTCP_PM_CMD_REMOVE, + MPTCP_PM_CMD_SUBFLOW_CREATE, + MPTCP_PM_CMD_SUBFLOW_DESTROY, __MPTCP_PM_CMD_AFTER_LAST }; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a26750f19f65..e099f2a12504 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1096,6 +1096,8 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ATTR_ADDR_REMOTE] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -1205,6 +1207,16 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], return err; } +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + memset(addr, 0, sizeof(*addr)); + + return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); +} + int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry) @@ -2210,6 +2222,16 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .doit = mptcp_nl_cmd_remove, .flags = GENL_ADMIN_PERM, }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, + .doit = mptcp_nl_cmd_sf_create, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, + .doit = mptcp_nl_cmd_sf_destroy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 3a42c9e66126..f56378e4f597 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -242,3 +242,188 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) sock_put((struct sock *)msk); return err; } + +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_r; + struct mptcp_addr_info addr_l; + struct mptcp_sock *msk; + int err = -EINVAL; + struct sock *sk; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto create_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local 
addr"); + goto create_err; + } + + if (addr_l.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); + goto create_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto create_err; + } + + sk = &msk->sk.icsk_inet.sk; + lock_sock(sk); + + err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); + + release_sock(sk); + + create_err: + sock_put((struct sock *)msk); + return err; +} + +static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, + const struct mptcp_addr_info *local, + const struct mptcp_addr_info *remote) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + struct mptcp_subflow_context *subflow; + struct sock *found = NULL; + + if (local->family != remote->family) + return NULL; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + const struct inet_sock *issk; + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + + if (local->family != ssk->sk_family) + continue; + + issk = inet_sk(ssk); + + switch (ssk->sk_family) { + case AF_INET: + if (issk->inet_saddr != local->addr.s_addr || + issk->inet_daddr != remote->addr.s_addr) + continue; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *pinfo = inet6_sk(ssk); + + if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) + continue; + break; + } +#endif + default: + continue; + } + + if (issk->inet_sport == local->port && + issk->inet_dport == remote->port) { + found = ssk; + goto found; + } + } + +found: + release_sock(sk); + + return found; +} + +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_l; + struct mptcp_addr_info addr_r; + struct mptcp_sock *msk; + struct sock *sk, *ssk; + int err = -EINVAL; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto destroy_err; + } + + if (addr_l.family != addr_r.family) { + GENL_SET_ERR_MSG(info, "address families do not match"); + goto destroy_err; + } + + if (!addr_l.port || !addr_r.port) { + GENL_SET_ERR_MSG(info, "missing local or remote port"); + goto destroy_err; + } + + sk = &msk->sk.icsk_inet.sk; + ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); + if (ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, subflow); + err = 0; + } else { + err = -ESRCH; + } + + destroy_err: + sock_put((struct sock *)msk); + return err; +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 
4026aa3df7f4..f542aeaa5b09 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -756,6 +756,8 @@ u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry); @@ -809,6 +811,8 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, void mptcp_free_local_addr_list(struct mptcp_sock *msk); int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); -- cgit v1.2.3 From 36f8423597000bd7d5e48b7b306e1d0958e72359 Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Wed, 23 Mar 2022 15:46:35 -0700 Subject: cfg80211: support disabling EHT mode Allow userspace to disable EHT mode during association. Signed-off-by: Muna Sinada Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20220323224636.20211-1-quic_alokad@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 7 +++++++ 3 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cd1212113901..6a3e3f0a8615 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2735,6 +2735,7 @@ struct cfg80211_auth_request { * userspace if this flag is set. Only applicable for cfg80211_connect() * request (connect callback). 
* @ASSOC_REQ_DISABLE_HE: Disable HE + * @ASSOC_REQ_DISABLE_EHT: Disable EHT */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -2742,6 +2743,7 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_USE_RRM = BIT(2), CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), ASSOC_REQ_DISABLE_HE = BIT(4), + ASSOC_REQ_DISABLE_EHT = BIT(5), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0568a79097b8..d9490e3062a7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3175,6 +3175,8 @@ enum nl80211_attrs { NL80211_ATTR_EHT_CAPABILITY, + NL80211_ATTR_DISABLE_EHT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 174f254ee947..2c64baae9863 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -791,6 +791,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_RANGE(NLA_BINARY, NL80211_EHT_MIN_CAPABILITY_LEN, NL80211_EHT_MAX_CAPABILITY_LEN), + [NL80211_ATTR_DISABLE_EHT] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -10378,6 +10379,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) req.flags |= ASSOC_REQ_DISABLE_HE; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) + req.flags |= ASSOC_REQ_DISABLE_EHT; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -11166,6 +11170,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE])) connect.flags |= ASSOC_REQ_DISABLE_HE; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) + connect.flags |= ASSOC_REQ_DISABLE_EHT; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), -- cgit v1.2.3 From c4a67a21a6d255ddcbaa076c0412aad73c7e0c02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 4 May 2022 08:40:37 -0700 Subject: Revert "Merge branch 'mlxsw-line-card-model'" This reverts commit 5e927a9f4b9f29d78a7c7d66ea717bb5c8bbad8e, reversing changes made to cfc1d91a7d78cf9de25b043d81efcc16966d55b3. The discussion is still ongoing so let's remove the uAPI until the discussion settles. Link: https://lore.kernel.org/all/20220425090021.32e9a98f@kernel.org/ Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20220504154037.539442-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-linecard.rst | 4 - Documentation/networking/devlink/mlxsw.rst | 33 --- drivers/net/ethernet/mellanox/mlxsw/core.h | 1 - .../net/ethernet/mellanox/mlxsw/core_linecards.c | 237 +--------------- drivers/net/ethernet/mellanox/mlxsw/reg.h | 87 +----- include/net/devlink.h | 18 +- include/uapi/linux/devlink.h | 5 - net/core/devlink.c | 303 +-------------------- .../drivers/net/mlxsw/devlink_linecard.sh | 61 ----- 9 files changed, 10 insertions(+), 739 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-linecard.rst b/Documentation/networking/devlink/devlink-linecard.rst index a98b468ad479..6c0b8928bc13 100644 --- a/Documentation/networking/devlink/devlink-linecard.rst +++ b/Documentation/networking/devlink/devlink-linecard.rst @@ -14,7 +14,6 @@ system. Following operations are provided: * Get a list of supported line card types. 
* Provision of a slot with specific line card type. * Get and monitor of line card state and its change. - * Get information about line card versions and devices. Line card according to the type may contain one or more gearboxes to mux the lanes with certain speed to multiple ports with lanes @@ -121,6 +120,3 @@ Example usage # Set slot 8 to be unprovisioned: $ devlink lc set pci/0000:01:00.0 lc 8 notype - - # Set info for slot 8: - $ devlink lc info pci/0000:01:00.0 lc 8 diff --git a/Documentation/networking/devlink/mlxsw.rst b/Documentation/networking/devlink/mlxsw.rst index 0af345680510..cf857cb4ba8f 100644 --- a/Documentation/networking/devlink/mlxsw.rst +++ b/Documentation/networking/devlink/mlxsw.rst @@ -58,39 +58,6 @@ The ``mlxsw`` driver reports the following versions - running - Three digit firmware version -Line card info versions -======================= - -The ``mlxsw`` driver reports the following versions for line cards - -.. list-table:: devlink line card info versions implemented - :widths: 5 5 90 - - * - Name - - Type - - Description - * - ``hw.revision`` - - fixed - - The hardware revision for this line card - * - ``ini.version`` - - running - - Version of line card INI loaded - -Line card device info versions -============================== - -The ``mlxsw`` driver reports the following versions for line card devices - -.. list-table:: devlink line card device info versions implemented - :widths: 5 5 90 - - * - Name - - Type - - Description - * - ``fw.version`` - - running - - Three digit firmware version - Driver-specific Traps ===================== diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index d008282d7f2e..c2a891287047 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -581,7 +581,6 @@ struct mlxsw_linecard { active:1; u16 hw_revision; u16 ini_version; - struct list_head device_list; }; struct mlxsw_linecard_types_info; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c index 2abd31a62776..5c9869dcf674 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c @@ -87,191 +87,11 @@ static const char *mlxsw_linecard_type_name(struct mlxsw_linecard *linecard) return linecard->name; } -struct mlxsw_linecard_device_info { - u16 fw_major; - u16 fw_minor; - u16 fw_sub_minor; -}; - -struct mlxsw_linecard_device { - struct list_head list; - u8 index; - struct mlxsw_linecard *linecard; - struct devlink_linecard_device *devlink_device; - struct mlxsw_linecard_device_info info; -}; - -static struct mlxsw_linecard_device * -mlxsw_linecard_device_lookup(struct mlxsw_linecard *linecard, u8 index) -{ - struct mlxsw_linecard_device *device; - - list_for_each_entry(device, &linecard->device_list, list) - if (device->index == index) - return device; - return NULL; -} - -static int mlxsw_linecard_device_attach(struct mlxsw_core *mlxsw_core, - struct mlxsw_linecard *linecard, - u8 device_index, bool flash_owner) -{ - struct mlxsw_linecard_device *device; - int err; - - device = kzalloc(sizeof(*device), GFP_KERNEL); - if (!device) - return -ENOMEM; - device->index = device_index; - device->linecard = linecard; - - device->devlink_device = devlink_linecard_device_create(linecard->devlink_linecard, - device_index, device); - if (IS_ERR(device->devlink_device)) { - err = PTR_ERR(device->devlink_device); - goto 
err_devlink_linecard_device_attach; - } - - list_add_tail(&device->list, &linecard->device_list); - return 0; - -err_devlink_linecard_device_attach: - kfree(device); - return err; -} - -static void mlxsw_linecard_device_detach(struct mlxsw_core *mlxsw_core, - struct mlxsw_linecard *linecard, - struct mlxsw_linecard_device *device) -{ - list_del(&device->list); - devlink_linecard_device_destroy(linecard->devlink_linecard, - device->devlink_device); - kfree(device); -} - -static void mlxsw_linecard_devices_detach(struct mlxsw_linecard *linecard) -{ - struct mlxsw_core *mlxsw_core = linecard->linecards->mlxsw_core; - struct mlxsw_linecard_device *device, *tmp; - - list_for_each_entry_safe(device, tmp, &linecard->device_list, list) - mlxsw_linecard_device_detach(mlxsw_core, linecard, device); -} - -static int mlxsw_linecard_devices_attach(struct mlxsw_linecard *linecard) -{ - struct mlxsw_core *mlxsw_core = linecard->linecards->mlxsw_core; - u8 msg_seq = 0; - int err; - - do { - char mddq_pl[MLXSW_REG_MDDQ_LEN]; - bool flash_owner; - bool data_valid; - u8 device_index; - - mlxsw_reg_mddq_device_info_pack(mddq_pl, linecard->slot_index, - msg_seq); - err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(mddq), mddq_pl); - if (err) - return err; - mlxsw_reg_mddq_device_info_unpack(mddq_pl, &msg_seq, - &data_valid, &flash_owner, - &device_index, NULL, - NULL, NULL); - if (!data_valid) - break; - err = mlxsw_linecard_device_attach(mlxsw_core, linecard, - device_index, flash_owner); - if (err) - goto rollback; - } while (msg_seq); - - return 0; - -rollback: - mlxsw_linecard_devices_detach(linecard); - return err; -} - -static void mlxsw_linecard_device_update(struct mlxsw_linecard *linecard, - u8 device_index, - struct mlxsw_linecard_device_info *info) -{ - struct mlxsw_linecard_device *device; - - device = mlxsw_linecard_device_lookup(linecard, device_index); - if (!device) - return; - device->info = *info; -} - -static int mlxsw_linecard_devices_update(struct mlxsw_linecard *linecard) -{ - struct mlxsw_core *mlxsw_core = linecard->linecards->mlxsw_core; - u8 msg_seq = 0; - - do { - struct mlxsw_linecard_device_info info; - char mddq_pl[MLXSW_REG_MDDQ_LEN]; - bool data_valid; - u8 device_index; - int err; - - mlxsw_reg_mddq_device_info_pack(mddq_pl, linecard->slot_index, - msg_seq); - err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(mddq), mddq_pl); - if (err) - return err; - mlxsw_reg_mddq_device_info_unpack(mddq_pl, &msg_seq, - &data_valid, NULL, - &device_index, - &info.fw_major, - &info.fw_minor, - &info.fw_sub_minor); - if (!data_valid) - break; - mlxsw_linecard_device_update(linecard, device_index, &info); - } while (msg_seq); - - return 0; -} - -static int -mlxsw_linecard_device_info_get(struct devlink_linecard_device *devlink_linecard_device, - void *priv, struct devlink_info_req *req, - struct netlink_ext_ack *extack) -{ - struct mlxsw_linecard_device *device = priv; - struct mlxsw_linecard_device_info *info; - struct mlxsw_linecard *linecard; - char buf[32]; - - linecard = device->linecard; - mutex_lock(&linecard->lock); - if (!linecard->active) { - mutex_unlock(&linecard->lock); - return 0; - } - - info = &device->info; - - sprintf(buf, "%u.%u.%u", info->fw_major, info->fw_minor, - info->fw_sub_minor); - mutex_unlock(&linecard->lock); - - return devlink_info_version_running_put(req, - DEVLINK_INFO_VERSION_GENERIC_FW, - buf); -} - static void mlxsw_linecard_provision_fail(struct mlxsw_linecard *linecard) { linecard->provisioned = false; linecard->ready = false; linecard->active = false; - 
mlxsw_linecard_devices_detach(linecard); devlink_linecard_provision_fail(linecard->devlink_linecard); } @@ -412,7 +232,6 @@ mlxsw_linecard_provision_set(struct mlxsw_linecard *linecard, u8 card_type, { struct mlxsw_linecards *linecards = linecard->linecards; const char *type; - int err; type = mlxsw_linecard_types_lookup(linecards, card_type); mlxsw_linecard_status_event_done(linecard, @@ -430,11 +249,6 @@ mlxsw_linecard_provision_set(struct mlxsw_linecard *linecard, u8 card_type, return PTR_ERR(type); } } - err = mlxsw_linecard_devices_attach(linecard); - if (err) { - mlxsw_linecard_provision_fail(linecard); - return err; - } linecard->provisioned = true; linecard->hw_revision = hw_revision; linecard->ini_version = ini_version; @@ -447,7 +261,6 @@ static void mlxsw_linecard_provision_clear(struct mlxsw_linecard *linecard) mlxsw_linecard_status_event_done(linecard, MLXSW_LINECARD_STATUS_EVENT_TYPE_UNPROVISION); linecard->provisioned = false; - mlxsw_linecard_devices_detach(linecard); devlink_linecard_provision_clear(linecard->devlink_linecard); } @@ -479,18 +292,11 @@ static int mlxsw_linecard_ready_clear(struct mlxsw_linecard *linecard) return 0; } -static int mlxsw_linecard_active_set(struct mlxsw_linecard *linecard) +static void mlxsw_linecard_active_set(struct mlxsw_linecard *linecard) { - int err; - - err = mlxsw_linecard_devices_update(linecard); - if (err) - return err; - mlxsw_linecard_active_ops_call(linecard); linecard->active = true; devlink_linecard_activate(linecard->devlink_linecard); - return 0; } static void mlxsw_linecard_active_clear(struct mlxsw_linecard *linecard) @@ -539,11 +345,8 @@ static int mlxsw_linecard_status_process(struct mlxsw_linecards *linecards, goto out; } - if (active && linecard->active != active) { - err = mlxsw_linecard_active_set(linecard); - if (err) - goto out; - } + if (active && linecard->active != active) + mlxsw_linecard_active_set(linecard); if (!active && linecard->active != active) mlxsw_linecard_active_clear(linecard); @@ -934,44 +737,12 @@ static void mlxsw_linecard_types_get(struct devlink_linecard *devlink_linecard, *type_priv = ini_file; } -static int -mlxsw_linecard_info_get(struct devlink_linecard *devlink_linecard, void *priv, - struct devlink_info_req *req, - struct netlink_ext_ack *extack) -{ - struct mlxsw_linecard *linecard = priv; - char buf[32]; - int err; - - mutex_lock(&linecard->lock); - if (!linecard->provisioned) { - err = 0; - goto unlock; - } - - sprintf(buf, "%d", linecard->hw_revision); - err = devlink_info_version_fixed_put(req, "hw.revision", buf); - if (err) - goto unlock; - - sprintf(buf, "%d", linecard->ini_version); - err = devlink_info_version_running_put(req, "ini.version", buf); - if (err) - goto unlock; - -unlock: - mutex_unlock(&linecard->lock); - return err; -} - static const struct devlink_linecard_ops mlxsw_linecard_ops = { .provision = mlxsw_linecard_provision, .unprovision = mlxsw_linecard_unprovision, .same_provision = mlxsw_linecard_same_provision, .types_count = mlxsw_linecard_types_count, .types_get = mlxsw_linecard_types_get, - .info_get = mlxsw_linecard_info_get, - .device_info_get = mlxsw_linecard_device_info_get, }; struct mlxsw_linecard_status_event { @@ -1069,7 +840,6 @@ static int mlxsw_linecard_init(struct mlxsw_core *mlxsw_core, linecard->slot_index = slot_index; linecard->linecards = linecards; mutex_init(&linecard->lock); - INIT_LIST_HEAD(&linecard->device_list); devlink_linecard = devlink_linecard_create(priv_to_devlink(mlxsw_core), slot_index, &mlxsw_linecard_ops, @@ -1115,7 +885,6 @@ 
static void mlxsw_linecard_fini(struct mlxsw_core *mlxsw_core, mlxsw_core_flush_owq(); if (linecard->active) mlxsw_linecard_active_clear(linecard); - mlxsw_linecard_devices_detach(linecard); devlink_linecard_destroy(linecard->devlink_linecard); mutex_destroy(&linecard->lock); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 078e3aa04383..93af6c974ece 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -11643,11 +11643,7 @@ MLXSW_ITEM32(reg, mddq, sie, 0x00, 31, 1); enum mlxsw_reg_mddq_query_type { MLXSW_REG_MDDQ_QUERY_TYPE_SLOT_INFO = 1, - MLXSW_REG_MDDQ_QUERY_TYPE_DEVICE_INFO, /* If there are no devices - * on the slot, data_valid - * will be '0'. - */ - MLXSW_REG_MDDQ_QUERY_TYPE_SLOT_NAME, + MLXSW_REG_MDDQ_QUERY_TYPE_SLOT_NAME = 3, }; /* reg_mddq_query_type @@ -11661,28 +11657,6 @@ MLXSW_ITEM32(reg, mddq, query_type, 0x00, 16, 8); */ MLXSW_ITEM32(reg, mddq, slot_index, 0x00, 0, 4); -/* reg_mddq_response_msg_seq - * Response message sequential number. For a specific request, the response - * message sequential number is the following one. In addition, the last - * message should be 0. - * Access: RO - */ -MLXSW_ITEM32(reg, mddq, response_msg_seq, 0x04, 16, 8); - -/* reg_mddq_request_msg_seq - * Request message sequential number. - * The first message number should be 0. - * Access: Index - */ -MLXSW_ITEM32(reg, mddq, request_msg_seq, 0x04, 0, 8); - -/* reg_mddq_data_valid - * If set, the data in the data field is valid and contain the information - * for the queried index. - * Access: RO - */ -MLXSW_ITEM32(reg, mddq, data_valid, 0x08, 31, 1); - /* reg_mddq_slot_info_provisioned * If set, the INI file is applied and the card is provisioned. * Access: RO @@ -11769,65 +11743,6 @@ mlxsw_reg_mddq_slot_info_unpack(const char *payload, u8 *p_slot_index, *p_card_type = mlxsw_reg_mddq_slot_info_card_type_get(payload); } -/* reg_mddq_device_info_flash_owner - * If set, the device is the flash owner. Otherwise, a shared flash - * is used by this device (another device is the flash owner). - * Access: RO - */ -MLXSW_ITEM32(reg, mddq, device_info_flash_owner, 0x10, 30, 1); - -/* reg_mddq_device_info_device_index - * Device index. The first device should number 0. - * Access: RO - */ -MLXSW_ITEM32(reg, mddq, device_info_device_index, 0x10, 0, 8); - -/* reg_mddq_device_info_fw_major - * Major FW version number. - * Access: RO - */ -MLXSW_ITEM32(reg, mddq, device_info_fw_major, 0x14, 16, 16); - -/* reg_mddq_device_info_fw_minor - * Minor FW version number. - * Access: RO - */ -MLXSW_ITEM32(reg, mddq, device_info_fw_minor, 0x18, 16, 16); - -/* reg_mddq_device_info_fw_sub_minor - * Sub-minor FW version number. 
- * Access: RO - */ -MLXSW_ITEM32(reg, mddq, device_info_fw_sub_minor, 0x18, 0, 16); - -static inline void -mlxsw_reg_mddq_device_info_pack(char *payload, u8 slot_index, - u8 request_msg_seq) -{ - __mlxsw_reg_mddq_pack(payload, slot_index, - MLXSW_REG_MDDQ_QUERY_TYPE_DEVICE_INFO); - mlxsw_reg_mddq_request_msg_seq_set(payload, request_msg_seq); -} - -static inline void -mlxsw_reg_mddq_device_info_unpack(const char *payload, u8 *p_response_msg_seq, - bool *p_data_valid, bool *p_flash_owner, - u8 *p_device_index, u16 *p_fw_major, - u16 *p_fw_minor, u16 *p_fw_sub_minor) -{ - *p_response_msg_seq = mlxsw_reg_mddq_response_msg_seq_get(payload); - *p_data_valid = mlxsw_reg_mddq_data_valid_get(payload); - if (p_flash_owner) - *p_flash_owner = mlxsw_reg_mddq_device_info_flash_owner_get(payload); - *p_device_index = mlxsw_reg_mddq_device_info_device_index_get(payload); - if (p_fw_major) - *p_fw_major = mlxsw_reg_mddq_device_info_fw_major_get(payload); - if (p_fw_minor) - *p_fw_minor = mlxsw_reg_mddq_device_info_fw_minor_get(payload); - if (p_fw_sub_minor) - *p_fw_sub_minor = mlxsw_reg_mddq_device_info_fw_sub_minor_get(payload); -} - #define MLXSW_REG_MDDQ_SLOT_ASCII_NAME_LEN 20 /* reg_mddq_slot_ascii_name diff --git a/include/net/devlink.h b/include/net/devlink.h index 062895973656..2a2a2a0c93f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -150,9 +150,6 @@ struct devlink_port_new_attrs { sfnum_valid:1; }; -struct devlink_info_req; -struct devlink_linecard_device; - /** * struct devlink_linecard_ops - Linecard operations * @provision: callback to provision the linecard slot with certain @@ -171,8 +168,6 @@ struct devlink_linecard_device; * provisioned. * @types_count: callback to get number of supported types * @types_get: callback to get next type in list - * @info_get: callback to get linecard info - * @device_info_get: callback to get linecard device info */ struct devlink_linecard_ops { int (*provision)(struct devlink_linecard *linecard, void *priv, @@ -187,12 +182,6 @@ struct devlink_linecard_ops { void (*types_get)(struct devlink_linecard *linecard, void *priv, unsigned int index, const char **type, const void **type_priv); - int (*info_get)(struct devlink_linecard *linecard, void *priv, - struct devlink_info_req *req, - struct netlink_ext_ack *extack); - int (*device_info_get)(struct devlink_linecard_device *device, - void *priv, struct devlink_info_req *req, - struct netlink_ext_ack *extack); }; struct devlink_sb_pool_info { @@ -639,6 +628,7 @@ struct devlink_flash_update_params { #define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; +struct devlink_info_req; /** * struct devlink_region_ops - Region operations @@ -1588,12 +1578,6 @@ struct devlink_linecard * devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); -struct devlink_linecard_device * -devlink_linecard_device_create(struct devlink_linecard *linecard, - unsigned int device_index, void *priv); -void -devlink_linecard_device_destroy(struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device); void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type); void devlink_linecard_provision_clear(struct devlink_linecard *linecard); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index fb8c3864457f..b3d40a5d72ff 100644 --- a/include/uapi/linux/devlink.h +++ 
b/include/uapi/linux/devlink.h @@ -136,8 +136,6 @@ enum devlink_command { DEVLINK_CMD_LINECARD_NEW, DEVLINK_CMD_LINECARD_DEL, - DEVLINK_CMD_LINECARD_INFO_GET, /* can dump */ - /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -577,9 +575,6 @@ enum devlink_attr { DEVLINK_ATTR_LINECARD_STATE, /* u8 */ DEVLINK_ATTR_LINECARD_TYPE, /* string */ DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES, /* nested */ - DEVLINK_ATTR_LINECARD_DEVICE_LIST, /* nested */ - DEVLINK_ATTR_LINECARD_DEVICE, /* nested */ - DEVLINK_ATTR_LINECARD_DEVICE_INDEX, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 5f441a0e34f4..5cc88490f18f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -83,11 +83,10 @@ struct devlink_linecard { const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state and device_list */ + struct mutex state_lock; /* Protects state */ const char *type; struct devlink_linecard_type *types; unsigned int types_count; - struct list_head device_list; }; /** @@ -2059,56 +2058,6 @@ struct devlink_linecard_type { const void *priv; }; -struct devlink_linecard_device { - struct list_head list; - unsigned int index; - void *priv; -}; - -static int -devlink_nl_linecard_device_fill(struct sk_buff *msg, - struct devlink_linecard_device *linecard_device) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); - if (!attr) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, - linecard_device->index)) { - nla_nest_cancel(msg, attr); - return -EMSGSIZE; - } - nla_nest_end(msg, attr); - - return 0; -} - -static int devlink_nl_linecard_devices_fill(struct sk_buff *msg, - struct devlink_linecard *linecard) -{ - struct devlink_linecard_device *linecard_device; - struct nlattr *attr; - int err; - - if (list_empty(&linecard->device_list)) - return 0; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); - if (!attr) - return -EMSGSIZE; - list_for_each_entry(linecard_device, &linecard->device_list, list) { - err = devlink_nl_linecard_device_fill(msg, linecard_device); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, @@ -2119,7 +2068,6 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink_linecard_type *linecard_type; struct nlattr *attr; void *hdr; - int err; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -2152,10 +2100,6 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, nla_nest_end(msg, attr); } - err = devlink_nl_linecard_devices_fill(msg, linecard); - if (err) - goto nla_put_failure; - genlmsg_end(msg, hdr); return 0; @@ -2425,191 +2369,6 @@ static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, return 0; } -struct devlink_info_req { - struct sk_buff *msg; -}; - -static int -devlink_nl_linecard_device_info_fill(struct sk_buff *msg, - struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device, - struct netlink_ext_ack *extack) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); - if (!attr) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, - linecard_device->index)) { - nla_nest_cancel(msg, attr); - return 
-EMSGSIZE; - } - if (linecard->ops->device_info_get) { - struct devlink_info_req req; - int err; - - req.msg = msg; - err = linecard->ops->device_info_get(linecard_device, - linecard_device->priv, - &req, extack); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - -static int devlink_nl_linecard_devices_info_fill(struct sk_buff *msg, - struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - struct devlink_linecard_device *linecard_device; - struct nlattr *attr; - int err; - - if (list_empty(&linecard->device_list)) - return 0; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); - if (!attr) - return -EMSGSIZE; - list_for_each_entry(linecard_device, &linecard->device_list, list) { - err = devlink_nl_linecard_device_info_fill(msg, linecard, - linecard_device, - extack); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - -static int -devlink_nl_linecard_info_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_linecard *linecard, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, struct netlink_ext_ack *extack) -{ - struct devlink_info_req req; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) - goto nla_put_failure; - - req.msg = msg; - err = linecard->ops->info_get(linecard, linecard->priv, &req, extack); - if (err) - goto nla_put_failure; - - err = devlink_nl_linecard_devices_info_fill(msg, linecard, extack); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_linecard_info_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_linecard *linecard = info->user_ptr[1]; - struct devlink *devlink = linecard->devlink; - struct sk_buff *msg; - int err; - - if (!linecard->ops->info_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_info_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_INFO_GET, - info->snd_portid, info->snd_seq, 0, - info->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_linecard_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < start || !linecard->ops->info_get) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_info_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - 
devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&devlink->linecards_lock); -retry: - devlink_put(devlink); - } -out: - mutex_unlock(&devlink_mutex); - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -6602,6 +6361,10 @@ out_dev: return err; } +struct devlink_info_req { + struct sk_buff *msg; +}; + int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) { return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); @@ -9321,13 +9084,6 @@ static const struct genl_small_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, }, - { - .cmd = DEVLINK_CMD_LINECARD_INFO_GET, - .doit = devlink_nl_cmd_linecard_info_get_doit, - .dumpit = devlink_nl_cmd_linecard_info_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -10508,7 +10264,6 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, linecard->priv = priv; linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; mutex_init(&linecard->state_lock); - INIT_LIST_HEAD(&linecard->device_list); err = devlink_linecard_types_init(linecard); if (err) { @@ -10536,7 +10291,6 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) struct devlink *devlink = linecard->devlink; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - WARN_ON(!list_empty(&linecard->device_list)); mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); @@ -10545,52 +10299,6 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) } EXPORT_SYMBOL_GPL(devlink_linecard_destroy); -/** - * devlink_linecard_device_create - Create a device on linecard - * - * @linecard: devlink linecard - * @device_index: index of the linecard device - * @priv: user priv pointer - * - * Return: Line card device structure or an ERR_PTR() encoded error code. 
- */ -struct devlink_linecard_device * -devlink_linecard_device_create(struct devlink_linecard *linecard, - unsigned int device_index, void *priv) -{ - struct devlink_linecard_device *linecard_device; - - linecard_device = kzalloc(sizeof(*linecard_device), GFP_KERNEL); - if (!linecard_device) - return ERR_PTR(-ENOMEM); - linecard_device->index = device_index; - linecard_device->priv = priv; - mutex_lock(&linecard->state_lock); - list_add_tail(&linecard_device->list, &linecard->device_list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - return linecard_device; -} -EXPORT_SYMBOL_GPL(devlink_linecard_device_create); - -/** - * devlink_linecard_device_destroy - Destroy device on linecard - * - * @linecard: devlink linecard - * @linecard_device: devlink linecard device - */ -void -devlink_linecard_device_destroy(struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device) -{ - mutex_lock(&linecard->state_lock); - list_del(&linecard_device->list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - kfree(linecard_device); -} -EXPORT_SYMBOL_GPL(devlink_linecard_device_destroy); - /** * devlink_linecard_provision_set - Set provisioning on linecard * @@ -10623,7 +10331,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); - WARN_ON(!list_empty(&linecard->device_list)); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_linecard.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_linecard.sh index 53a65f416770..08a922d8b86a 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_linecard.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_linecard.sh @@ -152,7 +152,6 @@ unprovision_test() LC_16X100G_TYPE="16x100G" LC_16X100G_PORT_COUNT=16 -LC_16X100G_DEVICE_COUNT=4 supported_types_check() { @@ -178,42 +177,6 @@ supported_types_check() check_err $? "16X100G not found between supported types of linecard $lc" } -lc_info_check() -{ - local lc=$1 - local fixed_hw_revision - local running_ini_version - - fixed_hw_revision=$(devlink lc -v info $DEVLINK_DEV lc $lc -j | \ - jq -e -r '.[][][].versions.fixed."hw.revision"') - check_err $? "Failed to get linecard $lc fixed.hw.revision" - log_info "Linecard $lc fixed.hw.revision: \"$fixed_hw_revision\"" - running_ini_version=$(devlink lc -v info $DEVLINK_DEV lc $lc -j | \ - jq -e -r '.[][][].versions.running."ini.version"') - check_err $? "Failed to get linecard $lc running.ini.version" - log_info "Linecard $lc running.ini.version: \"$running_ini_version\"" -} - -lc_devices_check() -{ - local lc=$1 - local expected_device_count=$2 - local device_count - local device - - device_count=$(devlink lc show $DEVLINK_DEV lc $lc -j | \ - jq -e -r ".[][][].devices |length") - check_err $? "Failed to get linecard $lc device count" - [ $device_count != 0 ] - check_err $? "No device found on linecard $lc" - [ $device_count == $expected_device_count ] - check_err $? 
"Unexpected device count on linecard $lc (got $expected_device_count, expected $device_count)" - for (( device=0; device Date: Tue, 26 Apr 2022 12:11:33 -0600 Subject: io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg If IORING_RECVSEND_POLL_FIRST is set for recv/recvmsg or send/sendmsg, then we arm poll first rather than attempt a receive or send upfront. This can be useful if we expect there to be no data (or space) available for the request, as we can then avoid wasting time on the initial issue attempt. Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 +++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 10 ++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index efe4e92ad8ad..6db9ab8d4d15 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -637,6 +637,7 @@ struct io_sr_msg { int bgid; size_t len; size_t done_io; + unsigned int flags; }; struct io_open { @@ -5272,11 +5273,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->addr2 || sqe->file_index)) + if (unlikely(sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5311,6 +5315,10 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + flags = req->sr_msg.msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; @@ -5353,6 +5361,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) int min_ret = 0; int ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -5505,11 +5517,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->addr2 || sqe->file_index)) + if (unlikely(sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->bgid = READ_ONCE(sqe->buf_group); sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) @@ -5546,6 +5561,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + if (req->flags & REQ_F_BUFFER_SELECT) { kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) @@ -5603,6 +5622,10 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index fad63564678a..06621a278cb6 100644 --- a/include/uapi/linux/io_uring.h +++ 
b/include/uapi/linux/io_uring.h @@ -213,6 +213,16 @@ enum { #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) +/* + * send/sendmsg and recv/recvmsg flags (sqe->addr2) + * + * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send + * or receive and arm poll if that yields an + * -EAGAIN result, arm poll upfront and skip + * the initial transfer attempt. + */ +#define IORING_RECVSEND_POLL_FIRST (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 6cc2df8e3a3967e7c13a424f87f6efb1d4a62d80 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 6 May 2022 18:05:07 +0200 Subject: landlock: Add clang-format exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for a following commit, add clang-format on and clang-format off stanzas around constant definitions. This makes it possible to keep values aligned, which is much more readable than packed definitions. Link: https://lore.kernel.org/r/20220506160513.523257-2-mic@digikod.net Cc: stable@vger.kernel.org Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 4 ++++ security/landlock/fs.c | 2 ++ security/landlock/limits.h | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index b3d952067f59..15c31abb0d76 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -33,7 +33,9 @@ struct landlock_ruleset_attr { * - %LANDLOCK_CREATE_RULESET_VERSION: Get the highest supported Landlock ABI * version. */ +/* clang-format off */ #define LANDLOCK_CREATE_RULESET_VERSION (1U << 0) +/* clang-format on */ /** * enum landlock_rule_type - Landlock rule type @@ -120,6 +122,7 @@ struct landlock_path_beneath_attr { * :manpage:`access(2)`. * Future Landlock evolutions will enable to restrict them. */ +/* clang-format off */ #define LANDLOCK_ACCESS_FS_EXECUTE (1ULL << 0) #define LANDLOCK_ACCESS_FS_WRITE_FILE (1ULL << 1) #define LANDLOCK_ACCESS_FS_READ_FILE (1ULL << 2) @@ -133,5 +136,6 @@ struct landlock_path_beneath_attr { #define LANDLOCK_ACCESS_FS_MAKE_FIFO (1ULL << 10) #define LANDLOCK_ACCESS_FS_MAKE_BLOCK (1ULL << 11) #define LANDLOCK_ACCESS_FS_MAKE_SYM (1ULL << 12) +/* clang-format on */ #endif /* _UAPI_LINUX_LANDLOCK_H */ diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 97b8e421f617..4195a6be60b2 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -141,10 +141,12 @@ retry: } /* All access rights that can be tied to files. */ +/* clang-format off */ #define ACCESS_FILE ( \ LANDLOCK_ACCESS_FS_EXECUTE | \ LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE) +/* clang-format on */ /* * @path: Should have been checked by get_path_from_fd(). 
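The constants bracketed by these clang-format stanzas are the flags userspace feeds to the Landlock syscalls. As a rough illustration of how they are consumed (not part of this patch, and assuming a libc that exposes SYS_landlock_create_ruleset), a minimal sketch that probes the ABI version via LANDLOCK_CREATE_RULESET_VERSION and then opens a ruleset handling two of the access rights defined above:

#include <linux/landlock.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Probe the highest Landlock ABI version the kernel supports. */
	long abi = syscall(SYS_landlock_create_ruleset, NULL, 0,
			   LANDLOCK_CREATE_RULESET_VERSION);
	if (abi < 0) {
		perror("landlock_create_ruleset");
		return 1;
	}
	printf("Landlock ABI version: %ld\n", abi);

	/* Create a ruleset that handles read and write access to files. */
	struct landlock_ruleset_attr attr = {
		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
				     LANDLOCK_ACCESS_FS_WRITE_FILE,
	};
	int ruleset_fd = syscall(SYS_landlock_create_ruleset,
				 &attr, sizeof(attr), 0);
	if (ruleset_fd < 0) {
		perror("landlock_create_ruleset");
		return 1;
	}
	close(ruleset_fd);
	return 0;
}
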
diff --git a/security/landlock/limits.h b/security/landlock/limits.h index 2a0a1095ee27..a274ae6b5570 100644 --- a/security/landlock/limits.h +++ b/security/landlock/limits.h @@ -12,10 +12,14 @@ #include #include +/* clang-format off */ + #define LANDLOCK_MAX_NUM_LAYERS 64 #define LANDLOCK_MAX_NUM_RULES U32_MAX #define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_MAKE_SYM #define LANDLOCK_MASK_ACCESS_FS ((LANDLOCK_LAST_ACCESS_FS << 1) - 1) +/* clang-format on */ + #endif /* _SECURITY_LANDLOCK_LIMITS_H */ -- cgit v1.2.3 From a36e07dfe6ee71e209383ea9288cd8d1617e14f9 Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Fri, 6 May 2022 17:24:54 +0000 Subject: rfkill: uapi: fix RFKILL_IOCTL_MAX_SIZE ioctl request definition The definition of RFKILL_IOCTL_MAX_SIZE introduced by commit 54f586a91532 ("rfkill: make new event layout opt-in") is unusable since it is based on RFKILL_IOC_EXT_SIZE which has not been defined. Fix that by replacing the undefined constant with the constant which is intended to be used in this definition. Fixes: 54f586a91532 ("rfkill: make new event layout opt-in") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Gleb Fotengauer-Malinovskiy Signed-off-by: Dmitry V. Levin Link: https://lore.kernel.org/r/20220506172454.120319-1-glebfm@altlinux.org [add commit message provided later by Dmitry] Signed-off-by: Johannes Berg --- include/uapi/linux/rfkill.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 283c5a7b3f2c..db6c8588c1d0 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -184,7 +184,7 @@ struct rfkill_event_ext { #define RFKILL_IOC_NOINPUT 1 #define RFKILL_IOCTL_NOINPUT _IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT) #define RFKILL_IOC_MAX_SIZE 2 -#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_EXT_SIZE, __u32) +#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_MAX_SIZE, __u32) /* and that's all userspace gets */ -- cgit v1.2.3 From ebdeb7c01d025cb059f05dc26b9dc914e46dd43f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 31 Mar 2022 19:27:52 -0600 Subject: io_uring: add support for 128-byte SQEs Normal SQEs are 64 bytes in length, which is fine for all the commands we support. However, in preparation for supporting passthrough IO, provide an option for setting up a ring with 128-byte SQEs. We continue to use the same type for io_uring_sqe; it's marked and commented with a zero-sized array pad at the end. This provides up to 80 bytes of data for a passthrough command: 64 bytes for the extra added data, and 16 bytes available at the end of the existing SQE. Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++++++--- include/uapi/linux/io_uring.h | 8 ++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 53e54fc05488..106a0db56ddb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8519,8 +8519,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * though the application is the one updating it. 
*/ head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) + if (likely(head < ctx->sq_entries)) { + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; return &ctx->sq_sqes[head]; + } /* drop invalid entries */ ctx->cq_extra--; @@ -11689,7 +11693,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (p->flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; @@ -11933,7 +11940,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG)) + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 31e719f38615..ee84132cadad 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -63,6 +63,12 @@ struct io_uring_sqe { }; __u64 addr3; __u64 __pad2[1]; + + /* + * If the ring is initialized with IORING_SETUP_SQE128, then this field + * contains 64-bytes of padding, doubling the size of the SQE. + */ + __u64 __big_sqe_pad[0]; }; enum { @@ -119,6 +125,8 @@ enum { */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) +#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, -- cgit v1.2.3 From 7a51e5b44b92686eebd3e1b46b86e1eb4db975db Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 26 Apr 2022 11:21:23 -0700 Subject: io_uring: support CQE32 in io_uring_cqe This adds the big_cqe array to the struct io_uring_cqe to support large CQEs. Co-developed-by: Jens Axboe Signed-off-by: Stefan Roesch Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220426182134.136504-2-shr@fb.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ee84132cadad..ac2d90d669c3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -126,6 +126,7 @@ enum { #define IORING_SETUP_TASKRUN_FLAG (1U << 9) #define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ +#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ enum io_uring_op { IORING_OP_NOP, @@ -245,6 +246,12 @@ struct io_uring_cqe { __u64 user_data; /* sqe->data submission passed back */ __s32 res; /* result code for this event */ __u32 flags; + + /* + * If the ring is initialized with IORING_SETUP_CQE32, then this field + * contains 16-bytes of padding, doubling the size of the CQE. + */ + __u64 big_cqe[]; }; /* -- cgit v1.2.3 From bd32889e841c12533d09a1bd02bba932baa9ed8f Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 29 Apr 2022 23:56:41 +0000 Subject: binder: add BINDER_GET_EXTENDED_ERROR ioctl Provide a userspace mechanism to pull precise error information upon failed operations. 
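As a hedged sketch (not taken from the patch itself), userspace consumption of that mechanism could look like the following, using the BINDER_GET_EXTENDED_ERROR ioctl and struct binder_extended_error that this commit adds further down; the helper name is hypothetical and binder_fd is assumed to be an open binder device:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/android/binder.h>

/* Hypothetical helper: query details about the last failed operation. */
static void report_binder_error(int binder_fd)
{
	struct binder_extended_error ee;

	if (ioctl(binder_fd, BINDER_GET_EXTENDED_ERROR, &ee) < 0) {
		perror("BINDER_GET_EXTENDED_ERROR");
		return;
	}
	/* id ties the error to a transaction; command is a BR_* code. */
	fprintf(stderr, "binder op %u failed: command=%u param=%d\n",
		ee.id, ee.command, ee.param);
}
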
Extending the current error codes returned by the interfaces allows userspace to better determine the course of action. This could mean, for instance, retrying a failed transaction at a later point and thus offloading the error handling from the driver. Acked-by: Christian Brauner (Microsoft) Acked-by: Todd Kjos Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20220429235644.697372-3-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 60 +++++++++++++++++++++++++++++++++++++ drivers/android/binder_internal.h | 3 ++ include/uapi/linux/android/binder.h | 16 ++++++++++ 3 files changed, 79 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0690d46caa1..4c2caf38e056 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -147,6 +147,13 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, binder_stop_on_user_error = 2; \ } while (0) +#define binder_set_extended_error(ee, _id, _command, _param) \ + do { \ + (ee)->id = _id; \ + (ee)->command = _command; \ + (ee)->param = _param; \ + } while (0) + #define to_flat_binder_object(hdr) \ container_of(hdr, struct flat_binder_object, hdr) @@ -2708,6 +2715,24 @@ static struct binder_node *binder_get_node_refs_for_txn( return target_node; } +static void binder_set_txn_from_error(struct binder_transaction *t, int id, + uint32_t command, int32_t param) +{ + struct binder_thread *from = binder_get_txn_from_and_acq_inner(t); + + if (!from) { + /* annotation for sparse */ + __release(&from->proc->inner_lock); + return; + } + + /* don't override existing errors */ + if (from->ee.command == BR_OK) + binder_set_extended_error(&from->ee, id, command, param); + binder_inner_proc_unlock(from->proc); + binder_thread_dec_tmpref(from); +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2753,6 +2778,10 @@ static void binder_transaction(struct binder_proc *proc, e->offsets_size = tr->offsets_size; strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); + binder_inner_proc_lock(proc); + binder_set_extended_error(&thread->ee, t_debug_id, BR_OK, 0); + binder_inner_proc_unlock(proc); + if (reply) { binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; @@ -3498,10 +3527,16 @@ err_invalid_target_handle: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { + binder_set_txn_from_error(in_reply_to, t_debug_id, + return_error, return_error_param); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); } else { + binder_inner_proc_lock(proc); + binder_set_extended_error(&thread->ee, t_debug_id, + return_error, return_error_param); + binder_inner_proc_unlock(proc); thread->return_error.cmd = return_error; binder_enqueue_thread_work(thread, &thread->return_error.work); } @@ -4628,6 +4663,7 @@ static struct binder_thread *binder_get_thread_ilocked( thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; + thread->ee.command = BR_OK; INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } @@ -5066,6 +5102,25 @@ static int binder_ioctl_get_freezer_info( return 0; } +static int binder_ioctl_get_extended_error(struct binder_thread *thread, + void __user *ubuf) +{ + struct binder_extended_error *ee = &thread->ee; + 
binder_inner_proc_lock(thread->proc); + if (copy_to_user(ubuf, ee, sizeof(*ee))) { + binder_inner_proc_unlock(thread->proc); + return -EFAULT; + } + + ee->id = 0; + ee->command = BR_OK; + ee->param = 0; + binder_inner_proc_unlock(thread->proc); + + return 0; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -5274,6 +5329,11 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) binder_inner_proc_unlock(proc); break; } + case BINDER_GET_EXTENDED_ERROR: + ret = binder_ioctl_get_extended_error(thread, ubuf); + if (ret < 0) + goto err; + break; default: ret = -EINVAL; goto err; diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h index cf70a104594d..8dc0bccf8513 100644 --- a/drivers/android/binder_internal.h +++ b/drivers/android/binder_internal.h @@ -480,6 +480,8 @@ struct binder_proc { * (only accessed by this thread) * @reply_error: transaction errors reported by target thread * (protected by @proc->inner_lock) + * @ee: extended error information from this thread + * (protected by @proc->inner_lock) * @wait: wait queue for thread work * @stats: per-thread statistics * (atomics, no lock needed) @@ -504,6 +506,7 @@ struct binder_thread { bool process_todo; struct binder_error return_error; struct binder_error reply_error; + struct binder_extended_error ee; wait_queue_head_t wait; struct binder_stats stats; atomic_t tmp_ref; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 11157fae8a8e..e6ee8cae303b 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -236,6 +236,21 @@ struct binder_frozen_status_info { __u32 async_recv; }; +/* struct binder_extended_error - extended error information + * @id: identifier for the failed operation + * @command: command as defined by binder_driver_return_protocol + * @param: parameter holding a negative errno value + * + * Used with BINDER_GET_EXTENDED_ERROR. This extends the error information + * returned by the driver upon a failed operation. Userspace can pull this + * data to properly handle specific error scenarios. + */ +struct binder_extended_error { + __u32 id; + __u32 command; + __s32 param; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -249,6 +264,7 @@ struct binder_frozen_status_info { #define BINDER_FREEZE _IOW('b', 14, struct binder_freeze_info) #define BINDER_GET_FROZEN_INFO _IOWR('b', 15, struct binder_frozen_status_info) #define BINDER_ENABLE_ONEWAY_SPAM_DETECTION _IOW('b', 16, __u32) +#define BINDER_GET_EXTENDED_ERROR _IOWR('b', 17, struct binder_extended_error) /* * NOTE: Two special error codes you should check for when calling -- cgit v1.2.3 From 7ff960a6fe399fdcbca6159063684671ae57eee9 Mon Sep 17 00:00:00 2001 From: Shunsuke Mie Date: Tue, 10 May 2022 19:27:23 +0900 Subject: virtio: fix virtio transitional ids This commit fixes the transitional PCI device IDs. Fixes: d61914ea6ada ("virtio: update virtio id table, add transitional ids") Signed-off-by: Shunsuke Mie Link: https://lore.kernel.org/r/20220510102723.87666-1-mie@igel.co.jp Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/virtio_ids.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 80d76b75bccd..7aa2eb766205 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -73,12 +73,12 @@ * Virtio Transitional IDs */ -#define VIRTIO_TRANS_ID_NET 1000 /* transitional virtio net */ -#define VIRTIO_TRANS_ID_BLOCK 1001 /* transitional virtio block */ -#define VIRTIO_TRANS_ID_BALLOON 1002 /* transitional virtio balloon */ -#define VIRTIO_TRANS_ID_CONSOLE 1003 /* transitional virtio console */ -#define VIRTIO_TRANS_ID_SCSI 1004 /* transitional virtio SCSI */ -#define VIRTIO_TRANS_ID_RNG 1005 /* transitional virtio rng */ -#define VIRTIO_TRANS_ID_9P 1009 /* transitional virtio 9p console */ +#define VIRTIO_TRANS_ID_NET 0x1000 /* transitional virtio net */ +#define VIRTIO_TRANS_ID_BLOCK 0x1001 /* transitional virtio block */ +#define VIRTIO_TRANS_ID_BALLOON 0x1002 /* transitional virtio balloon */ +#define VIRTIO_TRANS_ID_CONSOLE 0x1003 /* transitional virtio console */ +#define VIRTIO_TRANS_ID_SCSI 0x1004 /* transitional virtio SCSI */ +#define VIRTIO_TRANS_ID_RNG 0x1005 /* transitional virtio rng */ +#define VIRTIO_TRANS_ID_9P 0x1009 /* transitional virtio 9p console */ #endif /* _LINUX_VIRTIO_IDS_H */ -- cgit v1.2.3 From c23d47abee3a54e4991ed3993340596d04aabd6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Apr 2022 08:33:03 +0200 Subject: loop: remove most the top-of-file boilerplate comment from the UAPI header Just leave the SPDX marker and the copyright notice and remove the irrelevant rest. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220419063303.583106-5-hch@lst.de Signed-off-by: Jens Axboe --- include/uapi/linux/loop.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index 98e60801195e..6f63527dd2ed 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ /* - * include/linux/loop.h - * - * Written by Theodore Ts'o, 3/29/93. - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. + * Copyright 1993 by Theodore Ts'o. */ #ifndef _UAPI_LINUX_LOOP_H #define _UAPI_LINUX_LOOP_H -- cgit v1.2.3 From 26101f5ab6bdf30ac25c8e578e0b4873e7849e0c Mon Sep 17 00:00:00 2001 From: Kaixi Fan Date: Sat, 30 Apr 2022 15:48:42 +0800 Subject: bpf: Add source ip in "struct bpf_tunnel_key" Add tunnel source ip field in "struct bpf_tunnel_key". Add related code to set and get tunnel source field. Signed-off-by: Kaixi Fan Link: https://lore.kernel.org/r/20220430074844.69214-2-fankaixi.li@bytedance.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ++++ net/core/filter.c | 9 +++++++++ tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 444fe6f1cf35..95a3d1ff6255 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5604,6 +5604,10 @@ struct bpf_tunnel_key { __u8 tunnel_ttl; __u16 tunnel_ext; /* Padding, future use. */ __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; }; /* user accessible mirror of in-kernel xfrm_state. 
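A hedged sketch (not part of the patch) of a tc BPF program that reads the new local address on a collect_md tunnel device; the helper and struct fields are the ones touched by this series, while the program name is hypothetical and SEC("tc") assumes a recent libbpf:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int tunnel_src_log(struct __sk_buff *skb)
{
	struct bpf_tunnel_key key = {};

	/* Now fills local_ipv4/local_ipv6 alongside the remote address. */
	if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0) < 0)
		return TC_ACT_OK;

	bpf_printk("tunnel remote %x local %x",
		   key.remote_ipv4, key.local_ipv4);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
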
diff --git a/net/core/filter.c b/net/core/filter.c index b741b9f7e6a9..fe0da529d00f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4498,6 +4498,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; @@ -4523,10 +4524,14 @@ set_compat: if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); + memcpy(to->local_ipv6, &info->key.u.ipv6.dst, + sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); + memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } @@ -4597,6 +4602,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): @@ -4639,10 +4645,13 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + memcpy(&info->key.u.ipv6.src, from->local_ipv6, + sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); } return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 444fe6f1cf35..95a3d1ff6255 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5604,6 +5604,10 @@ struct bpf_tunnel_key { __u8 tunnel_ttl; __u16 tunnel_ext; /* Padding, future use. */ __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; }; /* user accessible mirror of in-kernel xfrm_state. -- cgit v1.2.3 From f7e0beaf39d3868dc700d4954b26cf8443c5d423 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:19 -0700 Subject: bpf, x86: Generate trampolines from bpf_tramp_links Replace struct bpf_tramp_progs with struct bpf_tramp_links to collect struct bpf_tramp_link(s) for a trampoline. struct bpf_tramp_link extends bpf_link to act as a linked list node. arch_prepare_bpf_trampoline() accepts a struct bpf_tramp_links to collect all bpf_tramp_link(s) that a trampoline should call. Change BPF trampoline and bpf_struct_ops to pass bpf_tramp_links instead of bpf_tramp_progs. 
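Condensed, the change in the container types looks roughly like this (field subset taken directly from the diff below):

/* Before: a trampoline collected raw program pointers. */
struct bpf_tramp_progs {
	struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS];
	int nr_progs;
};

/* After: it collects links; each link is a bpf_link plus a list node. */
struct bpf_tramp_link {
	struct bpf_link link;
	struct hlist_node tramp_hlist;
};

struct bpf_tramp_links {
	struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
	int nr_links;
};
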
Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-2-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 36 +++++++++++---------- include/linux/bpf.h | 36 ++++++++++++++------- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 71 +++++++++++++++++++++++++++------------- kernel/bpf/syscall.c | 23 +++++-------- kernel/bpf/trampoline.c | 73 ++++++++++++++++++++++++------------------ net/bpf/bpf_dummy_struct_ops.c | 24 +++++++++++--- tools/bpf/bpftool/link.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 10 files changed, 164 insertions(+), 103 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 16b6efacf7c6..38eb43159230 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1762,10 +1762,12 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool save_ret) + struct bpf_tramp_link *l, int stack_size, + bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; + struct bpf_prog *p = l->link.prog; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1850,14 +1852,14 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, + struct bpf_tramp_links *tl, int stack_size, bool save_ret) { int i; u8 *prog = *pprog; - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, save_ret)) return -EINVAL; } @@ -1866,7 +1868,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, } static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size, + struct bpf_tramp_links *tl, int stack_size, u8 **branches) { u8 *prog = *pprog; @@ -1877,8 +1879,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, */ emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, true)) return -EINVAL; /* mod_ret prog stored return value into [rbp - 8]. 
Emit: @@ -1980,14 +1982,14 @@ static bool is_valid_bpf_tramp_flags(unsigned int flags) */ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { int ret, i, nr_args = m->nr_args; int regs_off, ip_off, args_off, stack_size = nr_args * 8; - struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; - struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; - struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; bool save_ret; @@ -2078,13 +2080,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } } - if (fentry->nr_progs) + if (fentry->nr_links) if (invoke_bpf(m, &prog, fentry, regs_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; - if (fmod_ret->nr_progs) { - branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + if (fmod_ret->nr_links) { + branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -2111,7 +2113,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i prog += X86_PATCH_SIZE; } - if (fmod_ret->nr_progs) { + if (fmod_ret->nr_links) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte @@ -2121,12 +2123,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. */ - for (i = 0; i < fmod_ret->nr_progs; i++) + for (i = 0; i < fmod_ret->nr_links; i++) emit_cond_near_jump(&branches[i], prog, branches[i], X86_JNE); } - if (fexit->nr_progs) + if (fexit->nr_links) if (invoke_bpf(m, &prog, fexit, regs_off, false)) { ret = -EINVAL; goto cleanup; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 551b7198ae8a..75e0110a65e1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -723,11 +723,11 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. 
Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 38 +#define BPF_MAX_TRAMP_LINKS 38 -struct bpf_tramp_progs { - struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; - int nr_progs; +struct bpf_tramp_links { + struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; + int nr_links; }; /* Different use cases for BPF trampoline: @@ -753,7 +753,7 @@ struct bpf_tramp_progs { struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); @@ -852,9 +852,10 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( { return bpf_func(ctx, insnsi); } + #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -905,12 +906,12 @@ int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; @@ -1009,7 +1010,6 @@ struct bpf_prog_aux { bool tail_call_reachable; bool xdp_has_frags; bool use_bpf_prog_pack; - struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -1096,6 +1096,18 @@ struct bpf_link_ops { struct bpf_link_info *info); }; +struct bpf_tramp_link { + struct bpf_link link; + struct hlist_node tramp_hlist; +}; + +struct bpf_tracing_link { + struct bpf_tramp_link link; + enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; @@ -1133,8 +1145,8 @@ bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 3e24ad0c4b3c..2b9112b80171 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -141,3 +141,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) diff --git 
a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 95a3d1ff6255..3d032ea1b6a3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1013,6 +1013,7 @@ enum bpf_link_type { BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 3a0103ad97bc..d9a3c9207240 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -33,15 +33,15 @@ struct bpf_struct_ops_map { const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; - /* progs has all the bpf_prog that is populated + /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ - struct bpf_prog **progs; + struct bpf_link **links; /* image is a page that has all the trampolines * that stores the func args before calling the bpf_prog. * A PAGE_SIZE "image" is enough to store all trampoline for - * "progs[]". + * "links[]". */ void *image; /* uvalue->data stores the kernel struct @@ -283,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) u32 i; for (i = 0; i < btf_type_vlen(t); i++) { - if (st_map->progs[i]) { - bpf_prog_put(st_map->progs[i]); - st_map->progs[i] = NULL; + if (st_map->links[i]) { + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } } @@ -316,18 +316,34 @@ static int check_zero_holes(const struct btf_type *t, void *data) return 0; } -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +static void bpf_struct_ops_link_release(struct bpf_link *link) +{ +} + +static void bpf_struct_ops_link_dealloc(struct bpf_link *link) +{ + struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); + + kfree(tlink); +} + +const struct bpf_link_ops bpf_struct_ops_link_lops = { + .release = bpf_struct_ops_link_release, + .dealloc = bpf_struct_ops_link_dealloc, +}; + +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end) { u32 flags; - tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; - tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + tlinks[BPF_TRAMP_FENTRY].links[0] = link; + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; flags = model->ret_size > 0 ? 
BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tprogs, NULL); + model, flags, tlinks, NULL); } static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -338,7 +354,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_progs *tprogs = NULL; + struct bpf_tramp_links *tlinks = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image, *image_end; @@ -362,8 +378,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -386,6 +402,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; + struct bpf_tramp_link *link; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; @@ -439,16 +456,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = PTR_ERR(prog); goto reset_unlock; } - st_map->progs[i] = prog; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops->type_id || prog->expected_attach_type != i) { + bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + bpf_prog_put(prog); + err = -ENOMEM; + goto reset_unlock; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog); + st_map->links[i] = &link->link; + + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], image, image_end); if (err < 0) @@ -491,7 +518,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tprogs); + kfree(tlinks); mutex_unlock(&st_map->lock); return err; } @@ -546,9 +573,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - if (st_map->progs) + if (st_map->links) bpf_struct_ops_map_put_progs(st_map); - bpf_map_area_free(st_map->progs); + bpf_map_area_free(st_map->links); bpf_jit_free_exec(st_map->image); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -597,11 +624,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->progs = - bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + st_map->links = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->progs || !st_map->image) { + if (!st_map->uvalue || !st_map->links || !st_map->image) { bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50164d324eaf..2dc582773344 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2864,19 +2864,12 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) } EXPORT_SYMBOL(bpf_link_get_from_fd); -struct bpf_tracing_link { - struct bpf_link link; - enum bpf_attach_type attach_type; - 
struct bpf_trampoline *trampoline; - struct bpf_prog *tgt_prog; -}; - static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline)); bpf_trampoline_put(tr_link->trampoline); @@ -2889,7 +2882,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -2898,7 +2891,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); seq_printf(seq, "attach_type:\t%d\n", @@ -2909,7 +2902,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -2990,7 +2983,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; @@ -3060,11 +3053,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tgt_prog = prog->aux->dst_prog; } - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; - err = bpf_trampoline_link_prog(prog, tr); + err = bpf_trampoline_link_prog(&link->link, tr); if (err) { bpf_link_cleanup(&link_primer); link = NULL; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ada97751ae1b..d5e6bc5517cb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -168,30 +168,30 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -static struct bpf_tramp_progs * +static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - const struct bpf_prog_aux *aux; - struct bpf_tramp_progs *tprogs; - struct bpf_prog **progs; + struct bpf_tramp_link *link; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link **links; int kind; *total = 0; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tprogs[kind].nr_progs = tr->progs_cnt[kind]; + tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - progs = tprogs[kind].progs; + links = tlinks[kind].links; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= aux->prog->call_get_func_ip; - *progs++ = aux->prog; + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= link->link.prog->call_get_func_ip; + *links++ = link; } } - return tprogs; + return tlinks; } static void __bpf_tramp_image_put_deferred(struct 
work_struct *work) @@ -330,14 +330,14 @@ out: static int bpf_trampoline_update(struct bpf_trampoline *tr) { struct bpf_tramp_image *im; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; u32 flags = BPF_TRAMP_F_RESTORE_REGS; bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tprogs)) - return PTR_ERR(tprogs); + tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tlinks)) + return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); @@ -353,15 +353,15 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs || - tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) + if (tlinks[BPF_TRAMP_FEXIT].nr_links || + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; if (ip_arg) flags |= BPF_TRAMP_F_IP_ARG; err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tprogs, + &tr->func.model, flags, tlinks, tr->func.addr); if (err < 0) goto out; @@ -381,7 +381,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) tr->cur_image = im; tr->selector++; out: - kfree(tprogs); + kfree(tlinks); return err; } @@ -407,13 +407,14 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; + struct bpf_tramp_link *link_exiting; int err = 0; int cnt; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -429,25 +430,33 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) err = -EBUSY; goto out; } - tr->extension_prog = prog; + tr->extension_prog = link->link.prog; err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - prog->bpf_func); + link->link.prog->bpf_func); goto out; } - if (cnt >= BPF_MAX_TRAMP_PROGS) { + if (cnt >= BPF_MAX_TRAMP_LINKS) { err = -E2BIG; goto out; } - if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + if (!hlist_unhashed(&link->tramp_hlist)) { /* prog already linked */ err = -EBUSY; goto out; } - hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + if (link_exiting->link.prog != link->link.prog) + continue; + /* prog already linked */ + err = -EBUSY; + goto out; + } + + hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr); if (err) { - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } out: @@ -456,12 +465,12 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. 
*/ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(prog); + kind = bpf_attach_type_to_tramp(link->link.prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); @@ -470,7 +479,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) tr->extension_prog = NULL; goto out; } - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(tr); out: @@ -635,7 +644,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { return -ENOTSUPP; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index d0e54e30658a..e78dadfc5829 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -72,13 +72,16 @@ static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +extern const struct bpf_link_ops bpf_struct_ops_link_lops; + int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_progs *tprogs; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; int prog_ret; @@ -92,8 +95,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) { + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) { err = -ENOMEM; goto out; } @@ -105,8 +108,17 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } set_vm_flush_reset_perms(image); + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out; + } + /* prog doesn't take the ownership of the reference from caller */ + bpf_prog_inc(prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); + op_idx = prog->expected_attach_type; - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[op_idx], image, image + PAGE_SIZE); if (err < 0) @@ -124,7 +136,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, out: kfree(args); bpf_jit_free_exec(image); - kfree(tprogs); + if (link) + bpf_link_put(&link->link); + kfree(tlinks); return err; } diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 8fb0116f9136..6353a789322b 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -23,6 +23,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_XDP] = "xdp", [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", }; static struct hashmap *link_table; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 95a3d1ff6255..3d032ea1b6a3 100644 --- 
a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1013,6 +1013,7 @@ enum bpf_link_type { BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:21 -0700 Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm. Pass a cookie along with BPF_LINK_CREATE requests. Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie. The cookie of a bpf_tracing_link is available by calling bpf_get_attach_cookie when running the BPF program of the attached link. The value of a cookie will be set at bpf_tramp_run_ctx by the trampoline of the link. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 +++-- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/bpf_lsm.c | 17 +++++++++++++++++ kernel/bpf/syscall.c | 12 ++++++++---- kernel/bpf/trampoline.c | 7 +++++-- kernel/trace/bpf_trace.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 8 files changed, 69 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1fbc5cf1c7a7..a2b6d197c226 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1769,9 +1769,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); struct bpf_prog *p = l->link.prog; + u64 cookie = l->cookie; - /* mov rdi, 0 */ - emit_mov_imm64(&prog, BPF_REG_1, 0, 0); + /* mov rdi, cookie */ + emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); /* Prepare struct bpf_tramp_run_ctx. * diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 256fb802e580..aba7ded56436 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,7 @@ struct bpf_link_ops { struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; + u64 cookie; }; struct bpf_tracing_link { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3d032ea1b6a3..bc7f89948f54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1490,6 +1490,15 @@ union bpf_attr { __aligned_u64 addrs; __aligned_u64 cookies; } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. 
*/ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; }; } link_create; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 064eccba641d..c1351df9f7ee 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -117,6 +117,21 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto = { + .func = bpf_get_attach_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -141,6 +156,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; case BPF_FUNC_ima_file_hash: return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d48165fccf49..72e53489165d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2921,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, - u32 btf_id) + u32 btf_id, + u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -2986,6 +2987,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -3271,7 +3273,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -4524,7 +4526,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_EXT: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, - attr->link_create.target_btf_id); + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -4539,7 +4542,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, - attr->link_create.target_btf_id); + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index baf1b65d523e..0e9b3aefc34a 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex); bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type ptype = 
prog->type; - return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN; + return (ptype == BPF_PROG_TYPE_TRACING && + (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN)) || + (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void *bpf_jit_alloc_exec_page(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fd11c17558d..2eaac094caf8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1091,6 +1091,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { + .func = bpf_get_attach_cookie_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { #ifndef CONFIG_X86 @@ -1719,6 +1734,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3d032ea1b6a3..bc7f89948f54 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1490,6 +1490,15 @@ union bpf_attr { __aligned_u64 addrs; __aligned_u64 cookies; } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; }; } link_create; -- cgit v1.2.3 From a8641d7d8500d41d312350470464e03f3df3672a Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Mon, 9 May 2022 18:50:25 +0530 Subject: HID: amd_sfh: Move bus declaration outside of amd-sfh This should allow external drivers to reference this bus ID reservation and detect data coming from amd-sfh. 
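For illustration (not part of this patch), a minimal sketch of what such an external driver could do once the ID is exported; the helper name is hypothetical:

#include <linux/hid.h>
#include <linux/input.h>	/* BUS_AMD_SFH now lives in the uapi input header */

/* Hypothetical helper: true if this HID device is backed by amd-sfh. */
static bool is_amd_sfh_device(const struct hid_device *hdev)
{
	return hdev->bus == BUS_AMD_SFH;
}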
Signed-off-by: Mario Limonciello Signed-off-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/amd_sfh_hid.h | 1 - include/uapi/linux/input.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_hid.h b/drivers/hid/amd-sfh-hid/amd_sfh_hid.h index cb04f47c8648..ad264db63180 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_hid.h +++ b/drivers/hid/amd-sfh-hid/amd_sfh_hid.h @@ -12,7 +12,6 @@ #define AMDSFH_HID_H #define MAX_HID_DEVICES 5 -#define BUS_AMD_SFH 0x20 #define AMD_SFH_HID_VENDOR 0x1022 #define AMD_SFH_HID_PRODUCT 0x0001 diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index ee3127461ee0..ef4257ab3026 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -271,6 +271,7 @@ struct input_mask { #define BUS_RMI 0x1D #define BUS_CEC 0x1E #define BUS_INTEL_ISHTP 0x1F +#define BUS_AMD_SFH 0x20 /* * MT_TOOL types -- cgit v1.2.3 From ee692a21e9bf8354bd3ec816f1cf4bff8619ed77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 May 2022 11:17:45 +0530 Subject: fs,io_uring: add infrastructure for uring-cmd file_operations->uring_cmd is a file private handler. This is somewhat similar to ioctl but hopefully a lot more sane and useful as it can be used to enable many io_uring capabilities for the underlying operation. IORING_OP_URING_CMD is a file private kind of request. io_uring doesn't know what is in this command type, it's for the provider of ->uring_cmd() to deal with. Co-developed-by: Kanchan Joshi Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220511054750.20432-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 135 ++++++++++++++++++++++++++++++++++++------ include/linux/fs.h | 2 + include/linux/io_uring.h | 33 +++++++++++ include/uapi/linux/io_uring.h | 21 ++++--- 4 files changed, 165 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index ceaf7826ed71..44c57dca358d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -202,13 +202,6 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -972,6 +965,7 @@ struct io_kiocb { struct io_xattr xattr; struct io_socket sock; struct io_nop nop; + struct io_uring_cmd uring_cmd; }; u8 opcode; @@ -1050,6 +1044,14 @@ struct io_cancel_data { int seq; }; +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. 
+ */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -1289,6 +1291,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SOCKET] = { .audit_skip = 1, }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1428,6 +1436,8 @@ const char *io_uring_get_opcode(u8 opcode) return "GETXATTR"; case IORING_OP_SOCKET: return "SOCKET"; + case IORING_OP_URING_CMD: + return "URING_CMD"; case IORING_OP_LAST: return "INVALID"; } @@ -4507,10 +4517,6 @@ static int __io_getxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio)) - return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4620,10 +4626,6 @@ static int __io_setxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio)) - return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4910,6 +4912,96 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + req->uring_cmd.task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. 
+ */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + if (ret < 0) + req_set_fail(req); + if (req->ctx->flags & IORING_SETUP_CQE32) + __io_req_complete32(req, 0, ret, 0, res2, 0); + else + io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + + if (sqe->rw_flags) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(ioucmd, ret, 0); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6305,9 +6397,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_socket *sock = &req->sock; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index) + if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; sock->domain = READ_ONCE(sqe->fd); @@ -7755,6 +7845,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_getxattr_prep(req, sqe); case IORING_OP_SOCKET: return io_socket_prep(req, sqe); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7787,6 +7879,8 @@ static int io_req_prep_async(struct io_kiocb *req) return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", req->opcode); @@ -8081,6 +8175,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_SOCKET: ret = io_socket(req, issue_flags); break; + case IORING_OP_URING_CMD: + ret = io_uring_cmd(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -12699,6 +12796,8 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..87b5af1d9fbe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1953,6 +1953,7 @@ struct dir_context { #define 
REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; +struct io_uring_cmd; struct file_operations { struct module *owner; @@ -1995,6 +1996,7 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); } __randomize_layout; struct inode_operations { diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 24651c229ed2..4a2f6cc5a492 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,7 +5,32 @@ #include #include +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = 4, + IO_URING_F_CQE32 = 8, + IO_URING_F_IOPOLL = 16, +}; + +struct io_uring_cmd { + struct file *file; + const void *cmd; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + u32 cmd_op; + u32 pad; + u8 pdu[32]; /* available inline for free use */ +}; + #if defined(CONFIG_IO_URING) +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); @@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac2d90d669c3..23618be55dd2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -22,6 +22,7 @@ struct io_uring_sqe { union { __u64 off; /* offset into file */ __u64 addr2; + __u32 cmd_op; }; union { __u64 addr; /* pointer to buffer or iovecs */ @@ -61,14 +62,17 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 addr3; - __u64 __pad2[1]; - - /* - * If the ring is initialized with IORING_SETUP_SQE128, then this field - * contains 64-bytes of padding, doubling the size of the SQE. - */ - __u64 __big_sqe_pad[0]; + union { + struct { + __u64 addr3; + __u64 __pad2[1]; + }; + /* + * If the ring is initialized with IORING_SETUP_SQE128, then + * this field is used for 80 bytes of arbitrary command data + */ + __u8 cmd[0]; + }; }; enum { @@ -175,6 +179,7 @@ enum io_uring_op { IORING_OP_FGETXATTR, IORING_OP_GETXATTR, IORING_OP_SOCKET, + IORING_OP_URING_CMD, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 456cba386e94f22fa1b1426303fdcac9e66b1417 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Wed, 11 May 2022 11:17:48 +0530 Subject: nvme: wire-up uring-cmd support for io-passthru on char-device. Introduce handler for fops->uring_cmd(), implementing async passthru on char device (/dev/ngX). The handler supports newly introduced operation NVME_URING_CMD_IO. This operates on a new structure nvme_uring_cmd, which is similar to struct nvme_passthru_cmd64 but without the embedded 8b result field. 
This field is not needed since uring-cmd allows to return additional result via big-CQE. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220511054750.20432-5-joshi.k@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/ioctl.c | 192 +++++++++++++++++++++++++++++++++++++++- drivers/nvme/host/multipath.c | 1 + drivers/nvme/host/nvme.h | 4 + include/uapi/linux/nvme_ioctl.h | 25 ++++++ 5 files changed, 220 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e1846d04817f..682df98db341 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3699,6 +3699,7 @@ static const struct file_operations nvme_ns_chr_fops = { .release = nvme_ns_chr_release, .unlocked_ioctl = nvme_ns_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_chr_uring_cmd, }; static int nvme_add_ns_cdev(struct nvme_ns *ns) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 8d2569b656cc..92d695262d8f 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -5,6 +5,7 @@ */ #include /* for force_successful_syscall_return */ #include +#include #include "nvme.h" /* @@ -66,7 +67,8 @@ static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, static struct request *nvme_alloc_user_request(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, void **metap, unsigned timeout, bool vec) + u32 meta_seed, void **metap, unsigned timeout, bool vec, + unsigned int rq_flags, blk_mq_req_flags_t blk_flags) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -76,7 +78,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, void *meta = NULL; int ret; - req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0); + req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); if (IS_ERR(req)) return req; nvme_init_request(req, cmd); @@ -140,7 +142,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, int ret; req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, - meta_len, meta_seed, &meta, timeout, vec); + meta_len, meta_seed, &meta, timeout, vec, 0, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -330,6 +332,139 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } +struct nvme_uring_data { + __u64 metadata; + __u64 addr; + __u32 data_len; + __u32 metadata_len; + __u32 timeout_ms; +}; + +/* + * This overlays struct io_uring_cmd pdu. + * Expect build errors if this grows larger than that. 
+ */ +struct nvme_uring_cmd_pdu { + union { + struct bio *bio; + struct request *req; + }; + void *meta; /* kernel-resident buffer */ + void __user *meta_buffer; + u32 meta_len; +}; + +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; +} + +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + struct request *req = pdu->req; + struct bio *bio = req->bio; + int status; + u64 result; + + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + status = -EINTR; + else + status = nvme_req(req)->status; + + result = le64_to_cpu(nvme_req(req)->result.u64); + + if (pdu->meta) + status = nvme_finish_user_metadata(req, pdu->meta_buffer, + pdu->meta, pdu->meta_len, status); + if (bio) + blk_rq_unmap_user(bio); + blk_mq_free_request(req); + + io_uring_cmd_done(ioucmd, status, result); +} + +static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + /* extract bio before reusing the same field for request */ + struct bio *bio = pdu->bio; + + pdu->req = req; + req->bio = bio; + /* this takes care of moving rest of completion-work to task context */ + io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); +} + +static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + const struct nvme_uring_cmd *cmd = ioucmd->cmd; + struct request_queue *q = ns ? ns->queue : ctrl->admin_q; + struct nvme_uring_data d; + struct nvme_command c; + struct request *req; + unsigned int rq_flags = 0; + blk_mq_req_flags_t blk_flags = 0; + void *meta = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + c.common.opcode = READ_ONCE(cmd->opcode); + c.common.flags = READ_ONCE(cmd->flags); + if (c.common.flags) + return -EINVAL; + + c.common.command_id = 0; + c.common.nsid = cpu_to_le32(cmd->nsid); + if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) + return -EINVAL; + + c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); + c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); + c.common.metadata = 0; + c.common.dptr.prp1 = c.common.dptr.prp2 = 0; + c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); + c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); + c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); + c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); + c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); + c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + + d.metadata = READ_ONCE(cmd->metadata); + d.addr = READ_ONCE(cmd->addr); + d.data_len = READ_ONCE(cmd->data_len); + d.metadata_len = READ_ONCE(cmd->metadata_len); + d.timeout_ms = READ_ONCE(cmd->timeout_ms); + + if (issue_flags & IO_URING_F_NONBLOCK) { + rq_flags = REQ_NOWAIT; + blk_flags = BLK_MQ_REQ_NOWAIT; + } + + req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), + d.data_len, nvme_to_user_ptr(d.metadata), + d.metadata_len, 0, &meta, d.timeout_ms ? 
+ msecs_to_jiffies(d.timeout_ms) : 0, 0, rq_flags, + blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + req->end_io_data = ioucmd; + + /* to free bio on completion, as req->bio will be null at that time */ + pdu->bio = req->bio; + pdu->meta = meta; + pdu->meta_buffer = nvme_to_user_ptr(d.metadata); + pdu->meta_len = d.metadata_len; + + blk_execute_rq_nowait(req, 0, nvme_uring_cmd_end_io); + return -EIOCBQUEUED; +} + static bool is_ctrl_ioctl(unsigned int cmd) { if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) @@ -421,6 +556,42 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return __nvme_ioctl(ns, cmd, (void __user *)arg); } +static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); + + /* IOPOLL not supported yet */ + if (issue_flags & IO_URING_F_IOPOLL) + return -EOPNOTSUPP; + + /* NVMe passthrough requires bit SQE/CQE support */ + if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != + (IO_URING_F_SQE128|IO_URING_F_CQE32)) + return -EOPNOTSUPP; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_IO: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, + struct nvme_ns, cdev); + + return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); +} + #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp, struct nvme_ns_head *head, int srcu_idx) @@ -487,6 +658,21 @@ out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; } + +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EINVAL; + + if (ns) + ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} #endif /* CONFIG_NVME_MULTIPATH */ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d464fdf978fb..d3e2440d8abb 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .release = nvme_ns_head_chr_release, .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_head_chr_uring_cmd, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2b53ca63335..086ccbdd7003 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -782,6 +782,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); extern const struct attribute_group 
*nvme_ns_id_attr_groups[]; diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index b2e43185e3b5..04e458c649ab 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 { __u64 result; }; +/* same as struct nvme_passthru_cmd64, minus the 8b result field */ +struct nvme_uring_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 rsvd2; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -83,4 +105,7 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) +/* io_uring async commands: */ +#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) + #endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- cgit v1.2.3 From f569add47119fa910ed7711b26b8d38e21f7ea77 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Wed, 11 May 2022 11:17:49 +0530 Subject: nvme: add vectored-io support for uring-cmd wire up support for async passthru that takes an array of buffers (using iovec). Exposed via a new op NVME_URING_CMD_IO_VEC. Same 'struct nvme_uring_cmd' is to be used with - 1. cmd.addr as base address of user iovec array 2. cmd.data_len as count of iovec array elements Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220511054750.20432-6-joshi.k@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 9 ++++++--- include/uapi/linux/nvme_ioctl.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 92d695262d8f..7b0e2c9cdcae 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -399,7 +399,7 @@ static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) } static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct io_uring_cmd *ioucmd, unsigned int issue_flags) + struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); const struct nvme_uring_cmd *cmd = ioucmd->cmd; @@ -449,7 +449,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), d.data_len, nvme_to_user_ptr(d.metadata), d.metadata_len, 0, &meta, d.timeout_ms ? 
- msecs_to_jiffies(d.timeout_ms) : 0, 0, rq_flags, + msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags, blk_flags); if (IS_ERR(req)) return PTR_ERR(req); @@ -575,7 +575,10 @@ static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, switch (ioucmd->cmd_op) { case NVME_URING_CMD_IO: - ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags); + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_IO_VEC: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); break; default: ret = -ENOTTY; diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 04e458c649ab..0b1876aa5a59 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -107,5 +107,6 @@ struct nvme_uring_cmd { /* io_uring async commands: */ #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) +#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- cgit v1.2.3 From 07343110b293456d30393e89b86c4dee1ac051c8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Wed, 11 May 2022 17:38:53 +0800 Subject: bpf: add bpf_map_lookup_percpu_elem for percpu map Add a new eBPF helper, bpf_map_lookup_percpu_elem. The implementation is straightforward: it mirrors map_lookup_elem for percpu maps, adds a cpu parameter, and returns the value stored for the specified cpu. Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220511093854.411-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/arraymap.c | 15 +++++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/hashtab.c | 32 ++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 18 ++++++++++++++++++ kernel/bpf/verifier.c | 17 +++++++++++++++-- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 9 files changed, 103 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ded8711457f..5061ccd8b2dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -89,6 +89,7 @@ struct bpf_map_ops { int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); int (*map_pop_elem)(struct bpf_map *map, void *value); int (*map_peek_elem)(struct bpf_map *map, void *value); + void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -2184,6 +2185,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc7f89948f54..0210f85131b3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5164,6 +5164,14 @@ union bpf_attr { * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit.
+ * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5361,6 +5369,7 @@ union bpf_attr { FN(skb_set_tstamp), \ FN(ima_file_hash), \ FN(kptr_xchg), \ + FN(map_lookup_percpu_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 724613da6576..fe40d3b9458f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -243,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (cpu >= nr_cpu_ids) + return NULL; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); +} + int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -725,6 +739,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13e9dbeeedf3..76f68d0a7ae8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2619,6 +2619,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 705841279d16..17fb69c0e0dc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2199,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + else + return NULL; +} + static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -2211,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return per_cpu_ptr(htab_elem_get_ptr(l, 
map->key_size), cpu); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; @@ -2300,6 +2330,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, @@ -2318,6 +2349,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3e709fed5306..d5f104a39092 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -119,6 +119,22 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, }; +BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); +} + +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { + .func = bpf_map_lookup_percpu_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, @@ -1420,6 +1436,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c27fee73a2cb..05c1b6656824 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6137,6 +6137,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; + case BPF_FUNC_map_lookup_percpu_elem: + if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) @@ -6750,7 +6756,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && - func_id != BPF_FUNC_redirect_map) + func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { @@ -13810,7 +13817,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || - insn->imm == 
BPF_FUNC_for_each_map_elem)) { + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -13859,6 +13867,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { @@ -13886,6 +13896,9 @@ patch_map_ops_generic: case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); continue; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + continue; } goto patch_call_imm; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2eaac094caf8..7141ca8a1c2d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1197,6 +1197,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc7f89948f54..0210f85131b3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5164,6 +5164,14 @@ union bpf_attr { * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5361,6 +5369,7 @@ union bpf_attr { FN(skb_set_tstamp), \ FN(ima_file_hash), \ FN(kptr_xchg), \ + FN(map_lookup_percpu_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From c9b516f16be5896a3d798f8efb03acbd2ceec715 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 12 May 2022 20:38:36 -0700 Subject: ELF, uapi: fixup ELF_ST_TYPE definition This is very theoretical compile failure: ELF_ST_TYPE(st_info = A) Cast will bind first and st_info will stop being lvalue: error: lvalue required as left operand of assignment Given that the only use of this macro is ELF_ST_TYPE(sym->st_info) where st_info is "unsigned char" I've decided to remove cast especially given that companion macro ELF_ST_BIND doesn't use cast. Link: https://lkml.kernel.org/r/Ymv7G1BeX4kt3obz@localhost.localdomain Signed-off-by: Alexey Dobriyan Acked-by: Kees Cook Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton --- include/uapi/linux/elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 787c657bfae8..237f21a5e0f6 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -134,7 +134,7 @@ typedef __s64 Elf64_Sxword; #define STT_TLS 6 #define ELF_ST_BIND(x) ((x) >> 4) -#define ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) +#define ELF_ST_TYPE(x) ((x) & 0xf) #define ELF32_ST_BIND(x) ELF_ST_BIND(x) #define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) #define ELF64_ST_BIND(x) ELF_ST_BIND(x) -- cgit v1.2.3 From 783eb354fb3dcd598e8e7e8a2ed88c0fb6ce5d2f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 4 Apr 2022 15:19:41 +0900 Subject: agpgart.h: do not include <stdlib.h> from exported header Commit 35d0f1d54ecd ("include/uapi/linux/agpgart.h: include stdlib.h in userspace") included <stdlib.h> to fix the unknown size_t error, but I do not think it is the right fix. This header already uses __kernel_size_t a few lines below. Replace the remaining size_t, and stop including <stdlib.h>. Signed-off-by: Masahiro Yamada Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Reviewed-by: Nick Desaulniers --- include/uapi/linux/agpgart.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/agpgart.h b/include/uapi/linux/agpgart.h index f5251045181a..9cc3448c0b5b 100644 --- a/include/uapi/linux/agpgart.h +++ b/include/uapi/linux/agpgart.h @@ -52,7 +52,6 @@ #ifndef __KERNEL__ #include <linux/types.h> -#include <stdlib.h> struct agp_version { __u16 major; @@ -64,10 +63,10 @@ typedef struct _agp_info { __u32 bridge_id; /* bridge vendor/device */ __u32 agp_mode; /* mode info of bridge */ unsigned long aper_base;/* base of aperture */ - size_t aper_size; /* size of aperture */ - size_t pg_total; /* max pages (swap + system) */ - size_t pg_system; /* max pages (system) */ - size_t pg_used; /* current pages used */ + __kernel_size_t aper_size; /* size of aperture */ + __kernel_size_t pg_total; /* max pages (swap + system) */ + __kernel_size_t pg_system; /* max pages (system) */ + __kernel_size_t pg_used; /* current pages used */ } agp_info; typedef struct _agp_setup { -- cgit v1.2.3 From 1339f24b336db5ded9811f3fe7b948e0de207785 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 7 May 2022 14:18:44 -0600 Subject: io_uring: allow allocated fixed files for openat/openat2 If the application passes in IORING_FILE_INDEX_ALLOC as the file_slot, then that's a hint to allocate a fixed file descriptor rather than have one be passed in directly. This can be useful for having io_uring manage the direct descriptor space. Normal open direct requests will complete with 0 for success, and < 0 in case of error. If io_uring is asked to allocate the direct descriptor, then the direct descriptor is returned in case of success.
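For illustration (not part of this patch), a userspace sketch using liburing, assuming a liburing version that understands IORING_FILE_INDEX_ALLOC; error handling is trimmed and the function name is ours:

#include <fcntl.h>
#include <liburing.h>

/* Open 'path' into a kernel-picked fixed-file slot. Returns the
 * allocated direct descriptor (>= 0) or a negative errno, e.g.
 * -ENFILE if the fixed-file table is full. Assumes 'ring' already
 * has a fixed-file table registered.
 */
static int open_direct_alloc(struct io_uring *ring, const char *path)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_openat_direct(sqe, AT_FDCWD, path, O_RDONLY, 0,
				    IORING_FILE_INDEX_ALLOC);
	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* picked slot on success */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}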
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 9 +++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c40411a7e78..f448264a1067 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4697,7 +4697,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } -static int __maybe_unused io_file_bitmap_get(struct io_ring_ctx *ctx) +static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; unsigned long nr = ctx->nr_user_files; @@ -4722,6 +4722,36 @@ static int __maybe_unused io_file_bitmap_get(struct io_ring_ctx *ctx) return -ENFILE; } +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (alloc_slot) { + io_ring_submit_lock(ctx, issue_flags); + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) { + io_ring_submit_unlock(ctx, issue_flags); + return ret; + } + + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (alloc_slot) { + io_ring_submit_unlock(ctx, issue_flags); + if (!ret) + return file_slot; + } + + return ret; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -4777,8 +4807,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) fd_install(ret, file); else - ret = io_install_fixed_file(req, file, issue_flags, - req->open.file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + req->open.file_slot); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 06621a278cb6..b7f02a55032a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -63,6 +63,15 @@ struct io_uring_sqe { __u64 __pad2[2]; }; +/* + * If sqe->file_index is set to this for opcodes that instantiate a new + * direct descriptor (like openat/openat2/accept), then io_uring will allocate + * an available direct descriptor instead of having the application pass one + * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE + * if the space is full. + */ +#define IORING_FILE_INDEX_ALLOC (~0U) + enum { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, -- cgit v1.2.3 From a8da73a32b6e9271a613e5a0e90a8c35f40abeb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 May 2022 09:29:14 -0600 Subject: io_uring: add flag for allocating a fully sparse direct descriptor space Currently, to set up a fully sparse descriptor space upfront, the app needs to allocate an array of the full size, memset it to -1, and then pass that in. Make this a bit easier by allowing a flag that simply does this internally rather than needing to copy each slot separately. This works with IORING_REGISTER_FILES2 as the flag is set in struct io_uring_rsrc_register, and is only allowed when the type is IORING_RSRC_FILE, as this doesn't make sense for registered buffers. 
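For illustration, registering a fully sparse 64-slot table with the new flag might look like the following raw-syscall sketch (assumed usage, not from the patch; liburing's io_uring_register_files_sparse() wraps the same operation):

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: ask the kernel to build the -1-filled table internally
 * instead of copying in a user-allocated array of -1 fds. */
static int register_sparse_files(int ring_fd, unsigned nr)
{
	struct io_uring_rsrc_register rr;

	memset(&rr, 0, sizeof(rr));
	rr.nr = nr;
	rr.flags = IORING_RSRC_REGISTER_SPARSE;
	/* rr.data stays 0: no fd array is copied in when SPARSE is set */

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES2, &rr, sizeof(rr));
}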
Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++++--- include/uapi/linux/io_uring.h | 8 +++++++- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index b75a49d3831b..362189819898 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9111,12 +9111,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct io_fixed_file *file_slot; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto fail; } /* allow sparse sets */ - if (fd == -1) { + if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto fail; @@ -11759,14 +11759,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr || rr.resv || rr.resv2) + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE) + break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } @@ -11935,6 +11941,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b7f02a55032a..36ec43dc7bf9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -396,9 +396,15 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +/* + * Register a fully sparse file space, rather than pass in an array of all + * -1 file descriptors. + */ +#define IORING_RSRC_REGISTER_SPARSE (1U << 0) + struct io_uring_rsrc_register { __u32 nr; - __u32 resv; + __u32 flags; __u64 resv2; __aligned_u64 data; __aligned_u64 tags; -- cgit v1.2.3 From b1f9e876862d8f7176299ec4fb2108bc1045cbc8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: mm/uffd: enable write protection for shmem & hugetlbfs We've had all the necessary changes ready for both shmem and hugetlbfs. Turn on all the shmem/hugetlbfs switches for userfaultfd-wp. We can expand UFFD_API_RANGE_IOCTLS_BASIC with _UFFDIO_WRITEPROTECT too because all existing types now support write protection mode. Since vma_can_userfault() will be used elsewhere, move into userfaultfd_k.h. Link: https://lkml.kernel.org/r/20220405014926.15101-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 21 +++------------------ include/linux/userfaultfd_k.h | 20 ++++++++++++++++++++ include/uapi/linux/userfaultfd.h | 10 ++++++++-- mm/userfaultfd.c | 9 +++------ 4 files changed, 34 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 78b68e0f9774..e943370107d0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1258,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* FIXME: add WP support to hugetlbfs and shmem */ - if (vm_flags & VM_UFFD_WP) { - if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) - return false; - } - - if (vm_flags & VM_UFFD_MINOR) { - if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) - return false; - } - - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} - static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1956,6 +1938,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; +#endif +#ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e7afcdfd4b46..732b522bacb7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -18,6 +18,7 @@ #include #include #include +#include /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) @@ -140,6 +141,25 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if (vm_flags & VM_UFFD_MINOR) + return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); + +#ifndef CONFIG_PTE_MARKER_UFFD_WP + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem & hugetlbfs are not supported but only + * anonymous. 
+ */ + if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + return false; +#endif + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index ef739054cb1c..7d32b1e797fb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -33,7 +33,8 @@ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ UFFD_FEATURE_MINOR_SHMEM | \ - UFFD_FEATURE_EXACT_ADDRESS) + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -47,7 +48,8 @@ #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_CONTINUE) + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT) /* * Valid ioctl command number range with this API is from 0x00 to * @@ -194,6 +196,9 @@ struct uffdio_api { * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page * faults would be provided and the offset within the page would not be * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -207,6 +212,7 @@ struct uffdio_api { #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) __u64 features; __u64 ioctls; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 01edc18902c5..4f4892a5f767 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -732,15 +732,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = -ENOENT; dst_vma = find_dst_vma(dst_mm, start, len); - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + + if (!dst_vma) goto out_unlock; if (!userfaultfd_wp(dst_vma)) goto out_unlock; - if (!vma_is_anonymous(dst_vma)) + if (!vma_can_userfault(dst_vma, dst_vma->vm_flags)) goto out_unlock; if (is_vm_hugetlb_page(dst_vma)) { -- cgit v1.2.3 From 390ed29b5e425ba00da2b6113b74a14949f71b02 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 14 May 2022 22:20:43 +0800 Subject: io_uring: add IORING_ACCEPT_MULTISHOT for accept Add an accept flag, IORING_ACCEPT_MULTISHOT, to support multishot accept. 
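For illustration, arming a multishot accept from userspace is expected to look roughly like this sketch (assumptions: liburing for SQE preparation; the multishot handling itself lands in the follow-up patches of this series):

#include <liburing.h>

/* Sketch: accept flags live in sqe->ioprio, per the uapi comment added
 * below. One such SQE can then produce a CQE per accepted connection. */
static void prep_multishot_accept(struct io_uring_sqe *sqe, int listen_fd)
{
	io_uring_prep_accept(sqe, listen_fd, NULL, NULL, 0);
	sqe->ioprio |= IORING_ACCEPT_MULTISHOT;
}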
Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220514142046.58072-2-haoxu.linux@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 36ec43dc7bf9..15f821af9242 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -232,6 +232,11 @@ enum { */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) +/* + * accept flags stored in sqe->ioprio + */ +#define IORING_ACCEPT_MULTISHOT (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 8fa10ee183c3a1ecb53e81c95895ed5bc2a5530a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 15 May 2022 21:58:31 +0100 Subject: cdrom: mark CDROMGETSPINDOWN/CDROMSETSPINDOWN obsolete These were only implemented by the IDE CD driver, which has since been removed. Given that nobody is likely to create new CD/DVD hardware (and associated drivers) we can mark these appropriately. Cc: Jens Axboe Cc: Christoph Hellwig Cc: Phillip Potter Signed-off-by: Paul Gortmaker Link: https://lore.kernel.org/all/20220427132436.12795-3-paul.gortmaker@windriver.com Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20220515205833.944139-4-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- Documentation/userspace-api/ioctl/cdrom.rst | 6 ++++++ include/uapi/linux/cdrom.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/ioctl/cdrom.rst b/Documentation/userspace-api/ioctl/cdrom.rst index 682948fc88a3..2ad91dbebd7c 100644 --- a/Documentation/userspace-api/ioctl/cdrom.rst +++ b/Documentation/userspace-api/ioctl/cdrom.rst @@ -718,6 +718,9 @@ CDROMPLAYBLK CDROMGETSPINDOWN + Obsolete, was ide-cd only + + usage:: char spindown; @@ -736,6 +739,9 @@ CDROMGETSPINDOWN CDROMSETSPINDOWN + Obsolete, was ide-cd only + + usage:: char spindown diff --git a/include/uapi/linux/cdrom.h b/include/uapi/linux/cdrom.h index 804ff8d98f71..011e594e4a0d 100644 --- a/include/uapi/linux/cdrom.h +++ b/include/uapi/linux/cdrom.h @@ -103,7 +103,7 @@ #define CDROMREADALL 0x5318 /* read all 2646 bytes */ /* - * These ioctls are (now) only in ide-cd.c for controlling + * These ioctls were only in (now removed) ide-cd.c for controlling * drive spindown time. They should be implemented in the * Uniform driver, via generic packet commands, GPCMD_MODE_SELECT_10, * GPCMD_MODE_SENSE_10 and the GPMODE_POWER_PAGE... -- cgit v1.2.3 From 89527be8d8d672773eeaec910118a6e84fb597e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:56 -0700 Subject: net: add IFLA_TSO_{MAX_SIZE|SEGS} attributes New netlink attributes IFLA_TSO_MAX_SIZE and IFLA_TSO_MAX_SEGS are used to report to user-space the device TSO limits. ip -d link sh dev eth1 ... tso_max_size 65536 tso_max_segs 65535 Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. 
Miller --- include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 6 ++++++ tools/include/uapi/linux/if_link.h | 2 ++ 3 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d1e600816b82..5f58dcfe2787 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -368,6 +368,8 @@ enum { IFLA_PARENT_DEV_NAME, IFLA_PARENT_DEV_BUS_NAME, IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bdc891326102..f35cc21298ac 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1064,6 +1064,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1769,6 +1771,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1922,6 +1926,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, + [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index e1ba2d51b717..b339bf2196ca 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -348,6 +348,8 @@ enum { IFLA_PARENT_DEV_NAME, IFLA_PARENT_DEV_BUS_NAME, IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, __IFLA_MAX }; -- cgit v1.2.3 From f04fbcc64e4be16185151f9fca44ea1b3d074bd0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Apr 2022 16:08:27 +0800 Subject: btrfs: move definition of btrfs_raid_types to volumes.h It's only used internally as another way to represent btrfs profiles; it's not exposed through any on-disk format. In fact, btrfs_raid_types diverges from the on-disk format values. Furthermore, since it's an internal structure, its definition can change in the future. 
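For context, the enum serves as a dense index over the profile bits; a sketch of the kind of internal mapping that relies on it follows (modeled loosely on the in-tree helper btrfs_bg_flags_to_raid_index(); the details here are assumptions, not this patch's code):

/* Sketch: translate uapi BTRFS_BLOCK_GROUP_* profile bits into the
 * internal index. The flag macros are uapi; the enum is now
 * kernel-internal in fs/btrfs/volumes.h. */
static enum btrfs_raid_types bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;
	return BTRFS_RAID_SINGLE;	/* no profile bit set */
}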
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.h | 2 ++ fs/btrfs/volumes.h | 13 +++++++++++++ include/uapi/linux/btrfs_tree.h | 13 ------------- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a803e29bd781..c096695598c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,8 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include "volumes.h" + struct btrfs_space_info { spinlock_t lock; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 197877e684df..7b82aae89454 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,6 +17,19 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_RAID_RAID1C3, + BTRFS_RAID_RAID1C4, + BTRFS_NR_RAID_TYPES +}; + struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index b069752a8ecf..d4117152d907 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -880,19 +880,6 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) -enum btrfs_raid_types { - BTRFS_RAID_RAID10, - BTRFS_RAID_RAID1, - BTRFS_RAID_DUP, - BTRFS_RAID_RAID0, - BTRFS_RAID_SINGLE, - BTRFS_RAID_RAID5, - BTRFS_RAID_RAID6, - BTRFS_RAID_RAID1C3, - BTRFS_RAID_RAID1C4, - BTRFS_NR_RAID_TYPES -}; - #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ BTRFS_BLOCK_GROUP_METADATA) -- cgit v1.2.3 From 1c05bb947f6464756174830b778aabf8f9d6ed0e Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 16 May 2022 12:12:02 +0200 Subject: include/uapi/linux/vfio.h: Fix trivial typo - _IORW should be _IOWR instead There is no macro called _IORW, so use _IOWR in the comment instead. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20220516101202.88373-1-thuth@redhat.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fea86061b44e..733a1cddde30 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -643,7 +643,7 @@ enum { }; /** - * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) * * Return: 0 on success, -errno on failure: @@ -770,7 +770,7 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) /** - * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17, + * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17, * struct vfio_device_feature) * * Get, set, or probe feature data of the device. The feature is selected -- cgit v1.2.3 From 9f39d36530e5678d092d53c5c2c60d82b4dcc169 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 7 May 2022 13:55:58 +0200 Subject: can: isotp: add support for transmission without flow control Usually the ISO 15765-2 protocol is a point-to-point protocol to transfer segmented PDUs to a dedicated receiver. 
This receiver sends a flow control message to specify protocol options and timings (e.g. block size / STmin). The so called functional addressing communication allows a 1:N communication but is limited to a single frame length. This new CAN_ISOTP_CF_BROADCAST allows an unconfirmed 1:N communication with PDU length that would not fit into a single frame. This feature is not covered by the ISO 15765-2 standard. Link: https://lore.kernel.org/all/20220507115558.19065-1-socketcan@hartkopp.net Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/isotp.h | 25 ++++++----- net/can/isotp.c | 100 ++++++++++++++++++++++++++++++++--------- 2 files changed, 92 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index 590f8aea2b6d..439c982f7e81 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -124,18 +124,19 @@ struct can_isotp_ll_options { /* flags for isotp behaviour */ -#define CAN_ISOTP_LISTEN_MODE 0x001 /* listen only (do not send FC) */ -#define CAN_ISOTP_EXTEND_ADDR 0x002 /* enable extended addressing */ -#define CAN_ISOTP_TX_PADDING 0x004 /* enable CAN frame padding tx path */ -#define CAN_ISOTP_RX_PADDING 0x008 /* enable CAN frame padding rx path */ -#define CAN_ISOTP_CHK_PAD_LEN 0x010 /* check received CAN frame padding */ -#define CAN_ISOTP_CHK_PAD_DATA 0x020 /* check received CAN frame padding */ -#define CAN_ISOTP_HALF_DUPLEX 0x040 /* half duplex error state handling */ -#define CAN_ISOTP_FORCE_TXSTMIN 0x080 /* ignore stmin from received FC */ -#define CAN_ISOTP_FORCE_RXSTMIN 0x100 /* ignore CFs depending on rx stmin */ -#define CAN_ISOTP_RX_EXT_ADDR 0x200 /* different rx extended addressing */ -#define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ -#define CAN_ISOTP_SF_BROADCAST 0x800 /* 1-to-N functional addressing */ +#define CAN_ISOTP_LISTEN_MODE 0x0001 /* listen only (do not send FC) */ +#define CAN_ISOTP_EXTEND_ADDR 0x0002 /* enable extended addressing */ +#define CAN_ISOTP_TX_PADDING 0x0004 /* enable CAN frame padding tx path */ +#define CAN_ISOTP_RX_PADDING 0x0008 /* enable CAN frame padding rx path */ +#define CAN_ISOTP_CHK_PAD_LEN 0x0010 /* check received CAN frame padding */ +#define CAN_ISOTP_CHK_PAD_DATA 0x0020 /* check received CAN frame padding */ +#define CAN_ISOTP_HALF_DUPLEX 0x0040 /* half duplex error state handling */ +#define CAN_ISOTP_FORCE_TXSTMIN 0x0080 /* ignore stmin from received FC */ +#define CAN_ISOTP_FORCE_RXSTMIN 0x0100 /* ignore CFs depending on rx stmin */ +#define CAN_ISOTP_RX_EXT_ADDR 0x0200 /* different rx extended addressing */ +#define CAN_ISOTP_WAIT_TX_DONE 0x0400 /* wait for tx completion */ +#define CAN_ISOTP_SF_BROADCAST 0x0800 /* 1-to-N functional addressing */ +#define CAN_ISOTP_CF_BROADCAST 0x1000 /* 1-to-N transmission w/o FC */ /* protocol machine default values */ diff --git a/net/can/isotp.c b/net/can/isotp.c index 35a1ae61744c..2caeeae8ec16 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -104,6 +104,7 @@ MODULE_ALIAS("can-proto-6"); #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) +#define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ @@ -159,6 +160,23 @@ static inline struct isotp_sock *isotp_sk(const struct sock *sk) return (struct isotp_sock *)sk; } +static 
u32 isotp_bc_flags(struct isotp_sock *so) +{ + return so->opt.flags & ISOTP_ALL_BC_FLAGS; +} + +static bool isotp_register_rxid(struct isotp_sock *so) +{ + /* no broadcast modes => register rx_id for FC frame reception */ + return (isotp_bc_flags(so) == 0); +} + +static bool isotp_register_txecho(struct isotp_sock *so) +{ + /* all modes but SF_BROADCAST register for tx echo skbs */ + return (isotp_bc_flags(so) != CAN_ISOTP_SF_BROADCAST); +} + static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, @@ -803,7 +821,6 @@ static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; - so->tx.state = ISOTP_WAIT_FIRST_FC; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) @@ -936,7 +953,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ - if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && + if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; @@ -1000,12 +1017,41 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* don't enable wait queue for a single frame transmission */ wait_tx_done = 0; } else { - /* send first frame and wait for FC */ + /* send first frame */ isotp_create_fframe(cf, so, ae); - /* start timeout for FC */ - hrtimer_sec = 1; + if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { + /* set timer for FC-less operation (STmin = 0) */ + if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) + so->tx_gap = ktime_set(0, so->force_tx_stmin); + else + so->tx_gap = ktime_set(0, so->frame_txtime); + + /* disable wait for FCs due to activated block size */ + so->txfc.bs = 0; + + /* cfecho should have been zero'ed by init */ + if (so->cfecho) + pr_notice_once("can-isotp: no fc cfecho %08X\n", + so->cfecho); + + /* set consecutive frame echo tag */ + so->cfecho = *(u32 *)cf->data; + + /* switch directly to ISOTP_SENDING state */ + so->tx.state = ISOTP_SENDING; + + /* start timeout for unlikely lost echo skb */ + hrtimer_sec = 2; + } else { + /* standard flow control check */ + so->tx.state = ISOTP_WAIT_FIRST_FC; + + /* start timeout for FC */ + hrtimer_sec = 1; + } + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); } @@ -1025,6 +1071,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (hrtimer_sec) hrtimer_cancel(&so->txtimer); + /* reset consecutive frame echo tag */ + so->cfecho = 0; + goto err_out_drop; } @@ -1120,15 +1169,17 @@ static int isotp_release(struct socket *sock) lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { + if (so->bound && isotp_register_txecho(so)) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { - can_rx_unregister(net, dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); + if (isotp_register_rxid(so)) + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); @@ -1164,7 +1215,6 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) canid_t tx_id, rx_id; int err = 0; int notify_enetdown = 0; - int do_rx_reg = 1; if (len < ISOTP_MIN_NAMELEN) 
return -EINVAL; @@ -1192,12 +1242,8 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out; } - /* do not register frame reception for functional addressing */ - if (so->opt.flags & CAN_ISOTP_SF_BROADCAST) - do_rx_reg = 0; - - /* do not validate rx address for functional addressing */ - if (do_rx_reg && rx_id == tx_id) { + /* ensure different CAN IDs when the rx_id is to be registered */ + if (isotp_register_rxid(so) && rx_id == tx_id) { err = -EADDRNOTAVAIL; goto out; } @@ -1222,10 +1268,11 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; - if (do_rx_reg) { + if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); + if (isotp_register_txecho(so)) { /* no consecutive frame echo skb in flight */ so->cfecho = 0; @@ -1294,6 +1341,15 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; + /* these broadcast flags are not allowed together */ + if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { + /* CAN_ISOTP_SF_BROADCAST is prioritized */ + so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; + + /* give user feedback on wrong config attempt */ + ret = -EINVAL; + } + /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) @@ -1444,10 +1500,12 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { - can_rx_unregister(dev_net(dev), dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); + if (so->bound && isotp_register_txecho(so)) { + if (isotp_register_rxid(so)) + can_rx_unregister(dev_net(dev), dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); -- cgit v1.2.3 From b87f5e25b2f9deb503a61c6957c7b1680d91cfea Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Fri, 6 May 2022 01:03:48 +0200 Subject: media: uapi: Add IPU3 packed Y10 format Some platforms with an Intel IPU3 have an IR sensor producing 10 bit greyscale format data that is transmitted over a CSI-2 bus to a CIO2 device - this packs the data into 32 bytes per 25 pixels. Add an entry to the uAPI header defining that format. Signed-off-by: Daniel Scally Acked-by: Andy Shevchenko Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst | 14 +++++++++++++- drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/uapi/linux/videodev2.h | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst index 8ebd58c3588f..6a387f9df3ba 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst @@ -48,6 +48,17 @@ are often referred to as greyscale formats. - ... - ... + * .. _V4L2-PIX-FMT-IPU3-Y10: + + - ``V4L2_PIX_FMT_IPU3_Y10`` + - 'ip3y' + + - Y'\ :sub:`0`\ [7:0] + - Y'\ :sub:`1`\ [5:0] Y'\ :sub:`0`\ [9:8] + - Y'\ :sub:`2`\ [3:0] Y'\ :sub:`1`\ [9:6] + - Y'\ :sub:`3`\ [1:0] Y'\ :sub:`2`\ [9:4] + - Y'\ :sub:`3`\ [9:2] + * .. 
_V4L2-PIX-FMT-Y10: - ``V4L2_PIX_FMT_Y10`` @@ -133,4 +144,5 @@ are often referred to as greyscale formats. For the Y16 and Y16_BE formats, the actual sampling precision may be lower than 16 bits. For example, 10 bits per pixel uses values in the range 0 to - 1023. + 1023. For the IPU3_Y10 format 25 pixels are packed into 32 bytes, which + leaves the 6 most significant bits of the last byte padded with 0. diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index e2636539c9db..21470de62d72 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1269,6 +1269,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_Y16_BE: descr = "16-bit Greyscale BE"; break; case V4L2_PIX_FMT_Y10BPACK: descr = "10-bit Greyscale (Packed)"; break; case V4L2_PIX_FMT_Y10P: descr = "10-bit Greyscale (MIPI Packed)"; break; + case V4L2_PIX_FMT_IPU3_Y10: descr = "10-bit greyscale (IPU3 Packed)"; break; case V4L2_PIX_FMT_Y8I: descr = "Interleaved 8-bit Greyscale"; break; case V4L2_PIX_FMT_Y12I: descr = "Interleaved 12-bit Greyscale"; break; case V4L2_PIX_FMT_Z16: descr = "16-bit Depth"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 6d465dc443b7..343b95107fce 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -569,6 +569,7 @@ struct v4l2_pix_format { /* Grey bit-packed formats */ #define V4L2_PIX_FMT_Y10BPACK v4l2_fourcc('Y', '1', '0', 'B') /* 10 Greyscale bit-packed */ #define V4L2_PIX_FMT_Y10P v4l2_fourcc('Y', '1', '0', 'P') /* 10 Greyscale, MIPI RAW10 packed */ +#define V4L2_PIX_FMT_IPU3_Y10 v4l2_fourcc('i', 'p', '3', 'y') /* IPU3 packed 10-bit greyscale */ /* Palette formats */ #define V4L2_PIX_FMT_PAL8 v4l2_fourcc('P', 'A', 'L', '8') /* 8 8-bit palette */ @@ -749,7 +750,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_QC08C v4l2_fourcc('Q', '0', '8', 'C') /* Qualcomm 8-bit compressed */ #define V4L2_PIX_FMT_QC10C v4l2_fourcc('Q', '1', '0', 'C') /* Qualcomm 10-bit compressed */ -/* 10bit raw bayer packed, 32 bytes for every 25 pixels, last LSB 6 bits unused */ +/* 10bit raw packed, 32 bytes for every 25 pixels, last LSB 6 bits unused */ #define V4L2_PIX_FMT_IPU3_SBGGR10 v4l2_fourcc('i', 'p', '3', 'b') /* IPU3 packed 10-bit BGGR bayer */ #define V4L2_PIX_FMT_IPU3_SGBRG10 v4l2_fourcc('i', 'p', '3', 'g') /* IPU3 packed 10-bit GBRG bayer */ #define V4L2_PIX_FMT_IPU3_SGRBG10 v4l2_fourcc('i', 'p', '3', 'G') /* IPU3 packed 10-bit GRBG bayer */ -- cgit v1.2.3 From 7c3e9fcad9c7d8bb5d69a576044fb16b1d2e8a01 Mon Sep 17 00:00:00 2001 From: Jérôme Pouiller Date: Tue, 17 May 2022 09:27:08 +0200 Subject: dma-buf: fix use of DMA_BUF_SET_NAME_{A,B} in userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The typedefs u32 and u64 are not available in userspace. 
Thus users get an error when they try to use DMA_BUF_SET_NAME_A or DMA_BUF_SET_NAME_B: $ gcc -Wall -c -MMD -c -o ioctls_list.o ioctls_list.c In file included from /usr/include/x86_64-linux-gnu/asm/ioctl.h:1, from /usr/include/linux/ioctl.h:5, from /usr/include/asm-generic/ioctls.h:5, from ioctls_list.c:11: ioctls_list.c:463:29: error: ‘u32’ undeclared here (not in a function) 463 | { "DMA_BUF_SET_NAME_A", DMA_BUF_SET_NAME_A, -1, -1 }, // linux/dma-buf.h | ^~~~~~~~~~~~~~~~~~ ioctls_list.c:464:29: error: ‘u64’ undeclared here (not in a function) 464 | { "DMA_BUF_SET_NAME_B", DMA_BUF_SET_NAME_B, -1, -1 }, // linux/dma-buf.h | ^~~~~~~~~~~~~~~~~~ The issue was initially reported here[1]. [1]: https://github.com/jerome-pouiller/ioctl/pull/14 Signed-off-by: Jérôme Pouiller Reviewed-by: Christian König Fixes: a5bff92eaac4 ("dma-buf: Fix SET_NAME ioctl uapi") CC: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20220517072708.245265-1-Jerome.Pouiller@silabs.com Signed-off-by: Christian König --- include/uapi/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index 8e4a2ca0bcbf..b1523cb8ab30 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -92,7 +92,7 @@ struct dma_buf_sync { * between them in actual uapi, they're just different numbers. */ #define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) -#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, u32) -#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, u64) +#define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, __u32) +#define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, __u64) #endif -- cgit v1.2.3 From c8383054506c77b814489c09877b5db83fd4abf2 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 25 Apr 2022 20:21:24 +0800 Subject: cachefiles: notify the user daemon when looking up cookie Fscache/CacheFiles used to serve as a local cache for a remote networking fs. A new on-demand read mode will be introduced for CacheFiles, which can boost the scenario where on-demand read semantics are needed, e.g. container image distribution. The essential difference between these two modes is seen when a cache miss occurs: In the original mode, the netfs will fetch the data from the remote server and then write it to the cache file; in on-demand read mode, fetching the data and writing it into the cache is delegated to a user daemon. As the first step, notify the user daemon when looking up a cookie. In this case, an anonymous fd is sent to the user daemon, through which the user daemon can write the fetched data to the cache file. Since the user daemon may move the anonymous fd around, e.g. through dup(), an object ID uniquely identifying the cache file is also attached. Also add one advisory flag (FSCACHE_ADV_WANT_CACHE_SIZE) suggesting that the cache file size shall be retrieved at runtime. This helps the scenario where one cache file contains multiple netfs files, e.g. for the purpose of deduplication. In this case, the netfs itself has no idea of the size of the cache file, whilst the user daemon is expected to provide that hint. 
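To make the OPEN round trip concrete, a minimal user-daemon sketch follows (assumptions only: the control fd is already bound in on-demand mode, the one-megabyte object size is a stand-in for whatever the daemon actually knows, and error handling is elided):

#include <linux/cachefiles.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Sketch: read one request from the cachefiles control fd and, if it
 * is an OPEN, answer with "copen <msg_id>,<object size>". */
static void handle_one_request(int ctlfd)
{
	char buf[CACHEFILES_MSG_MAX_SIZE];
	struct cachefiles_msg *msg = (struct cachefiles_msg *)buf;

	if (read(ctlfd, buf, sizeof(buf)) < (ssize_t)sizeof(*msg))
		return;

	if (msg->opcode == CACHEFILES_OP_OPEN) {
		struct cachefiles_open *load = (void *)msg->data;
		long size = 1048576;	/* assumed: daemon-known object size */
		char cmd[64];

		/* load->fd is the anon fd for filling the cache file later */
		(void)load;
		snprintf(cmd, sizeof(cmd), "copen %u,%ld", msg->msg_id, size);
		write(ctlfd, cmd, strlen(cmd));
	}
}

A negative size in the copen reply reports an error back to the kernel, matching the parsing in cachefiles_ondemand_copen() below.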
Signed-off-by: Jeffle Xu Link: https://lore.kernel.org/r/20220509074028.74954-3-jefflexu@linux.alibaba.com Acked-by: David Howells Signed-off-by: Gao Xiang --- fs/cachefiles/Kconfig | 12 ++ fs/cachefiles/Makefile | 1 + fs/cachefiles/daemon.c | 81 ++++++-- fs/cachefiles/internal.h | 51 +++++ fs/cachefiles/namei.c | 16 +- fs/cachefiles/ondemand.c | 378 ++++++++++++++++++++++++++++++++++++++ include/linux/fscache.h | 1 + include/trace/events/cachefiles.h | 2 + include/uapi/linux/cachefiles.h | 50 +++++ 9 files changed, 577 insertions(+), 15 deletions(-) create mode 100644 fs/cachefiles/ondemand.c create mode 100644 include/uapi/linux/cachefiles.h (limited to 'include/uapi/linux') diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 719faeeda168..8df715640a48 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -26,3 +26,15 @@ config CACHEFILES_ERROR_INJECTION help This permits error injection to be enabled in cachefiles whilst a cache is in service. + +config CACHEFILES_ONDEMAND + bool "Support for on-demand read" + depends on CACHEFILES + default n + help + This permits userspace to enable the cachefiles on-demand read mode. + In this mode, when a cache miss occurs, responsibility for fetching + the data lies with the cachefiles backend instead of with the netfs + and is delegated to userspace. + + If unsure, say N. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 16d811f1a2fa..c37a7a9af10b 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -16,5 +16,6 @@ cachefiles-y := \ xattr.o cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o +cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 7ac04ee2c0a0..d5417da7f792 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -75,6 +75,9 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { { "inuse", cachefiles_daemon_inuse }, { "secctx", cachefiles_daemon_secctx }, { "tag", cachefiles_daemon_tag }, +#ifdef CONFIG_CACHEFILES_ONDEMAND + { "copen", cachefiles_ondemand_copen }, +#endif { "", NULL } }; @@ -108,6 +111,8 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) INIT_LIST_HEAD(&cache->volumes); INIT_LIST_HEAD(&cache->object_list); spin_lock_init(&cache->object_list_lock); + xa_init_flags(&cache->reqs, XA_FLAGS_ALLOC); + xa_init_flags(&cache->ondemand_ids, XA_FLAGS_ALLOC1); /* set default caching limits * - limit at 1% free space and/or free files @@ -126,6 +131,39 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) return 0; } +static void cachefiles_flush_reqs(struct cachefiles_cache *cache) +{ + struct xarray *xa = &cache->reqs; + struct cachefiles_req *req; + unsigned long index; + + /* + * Make sure the following two operations won't be reordered. + * 1) set CACHEFILES_DEAD bit + * 2) flush requests in the xarray + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. + * + * CPU 1 CPU 2 + * ===== ===== + * flush requests in the xarray + * test CACHEFILES_DEAD bit + * enqueue the request + * set CACHEFILES_DEAD bit + */ + smp_mb(); + + xa_lock(xa); + xa_for_each(xa, index, req) { + req->error = -EIO; + complete(&req->done); + } + xa_unlock(xa); + + xa_destroy(&cache->reqs); + xa_destroy(&cache->ondemand_ids); +} + /* * Release a cache. 
*/ @@ -139,6 +177,8 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) set_bit(CACHEFILES_DEAD, &cache->flags); + if (cachefiles_in_ondemand_mode(cache)) + cachefiles_flush_reqs(cache); cachefiles_daemon_unbind(cache); /* clean up the control file interface */ @@ -152,23 +192,14 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) return 0; } -/* - * Read the cache state. - */ -static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, - size_t buflen, loff_t *pos) +static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) { - struct cachefiles_cache *cache = file->private_data; unsigned long long b_released; unsigned f_released; char buffer[256]; int n; - //_enter(",,%zu,", buflen); - - if (!test_bit(CACHEFILES_READY, &cache->flags)) - return 0; - /* check how much space the cache has */ cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); @@ -206,6 +237,25 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, return n; } +/* + * Read the cache state. + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + if (cachefiles_in_ondemand_mode(cache)) + return cachefiles_ondemand_daemon_read(cache, _buffer, buflen); + else + return cachefiles_do_daemon_read(cache, _buffer, buflen); +} + /* * Take a command from cachefilesd, parse it and act on it. */ @@ -297,8 +347,13 @@ static __poll_t cachefiles_daemon_poll(struct file *file, poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; - if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) - mask |= EPOLLIN; + if (cachefiles_in_ondemand_mode(cache)) { + if (!xa_empty(&cache->reqs)) + mask |= EPOLLIN; + } else { + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= EPOLLIN; + } if (test_bit(CACHEFILES_CULLING, &cache->flags)) mask |= EPOLLOUT; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index e80673d0ab97..4f5150a96849 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #define CACHEFILES_DIO_BLOCK_SIZE 4096 @@ -58,8 +60,13 @@ struct cachefiles_object { enum cachefiles_content content_info:8; /* Info about content presence */ unsigned long flags; #define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ +#ifdef CONFIG_CACHEFILES_ONDEMAND + int ondemand_id; +#endif }; +#define CACHEFILES_ONDEMAND_ID_CLOSED -1 + /* * Cache files cache definition */ @@ -98,11 +105,30 @@ struct cachefiles_cache { #define CACHEFILES_DEAD 1 /* T if cache dead */ #define CACHEFILES_CULLING 2 /* T if cull engaged */ #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ +#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ + struct xarray reqs; /* xarray of pending on-demand requests */ + struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ + u32 ondemand_id_next; }; +static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) +{ + return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) && + test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); +} + +struct cachefiles_req { + 
struct cachefiles_object *object; + struct completion done; + int error; + struct cachefiles_msg msg; +}; + +#define CACHEFILES_REQ_NEW XA_MARK_1 + #include static inline @@ -250,6 +276,31 @@ extern struct file *cachefiles_create_tmpfile(struct cachefiles_object *object); extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, struct cachefiles_object *object); +/* + * ondemand.c + */ +#ifdef CONFIG_CACHEFILES_ONDEMAND +extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen); + +extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, + char *args); + +extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); + +#else +static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +{ + return -EOPNOTSUPP; +} + +static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + return 0; +} +#endif + /* * security.c */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ca9f3e4ec4b3..facf2ebe464b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -452,10 +452,9 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; struct path path; - uint64_t ni_size = object->cookie->object_size; + uint64_t ni_size; long ret; - ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); cachefiles_begin_secure(cache, &saved_cred); @@ -481,6 +480,15 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) goto out_dput; } + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) { + file = ERR_PTR(ret); + goto out_unuse; + } + + ni_size = object->cookie->object_size; + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); + if (ni_size > 0) { trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, cachefiles_trunc_expand_tmpfile); @@ -586,6 +594,10 @@ static bool cachefiles_open_file(struct cachefiles_object *object, } _debug("file -> %pd positive", dentry); + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) + goto error_fput; + ret = cachefiles_check_auxdata(object, file); if (ret < 0) goto check_failed; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c new file mode 100644 index 000000000000..64fc312b16d3 --- /dev/null +++ b/fs/cachefiles/ondemand.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include "internal.h" + +static int cachefiles_ondemand_fd_release(struct inode *inode, + struct file *file) +{ + struct cachefiles_object *object = file->private_data; + struct cachefiles_cache *cache = object->volume->cache; + int object_id = object->ondemand_id; + + object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + xa_erase(&cache->ondemand_ids, object_id); + cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); + return 0; +} + +static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, + struct iov_iter *iter) +{ + struct cachefiles_object *object = kiocb->ki_filp->private_data; + struct cachefiles_cache *cache = object->volume->cache; + struct file *file = object->file; + size_t len = iter->count; + loff_t pos = kiocb->ki_pos; + const struct cred *saved_cred; + int ret; + + if (!file) + return -ENOBUFS; + + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(object, file, &pos, &len, true); + cachefiles_end_secure(cache, 
saved_cred); + if (ret < 0) + return ret; + + ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); + if (!ret) + ret = len; + + return ret; +} + +static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, + int whence) +{ + struct cachefiles_object *object = filp->private_data; + struct file *file = object->file; + + if (!file) + return -ENOBUFS; + + return vfs_llseek(file, pos, whence); +} + +static const struct file_operations cachefiles_ondemand_fd_fops = { + .owner = THIS_MODULE, + .release = cachefiles_ondemand_fd_release, + .write_iter = cachefiles_ondemand_fd_write_iter, + .llseek = cachefiles_ondemand_fd_llseek, +}; + +/* + * OPEN request Completion (copen) + * - command: "copen ," + * indicates the object size if >=0, error code if negative + */ +int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + struct fscache_cookie *cookie; + char *pid, *psize; + unsigned long id; + long size; + int ret; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + if (!*args) { + pr_err("Empty id specified\n"); + return -EINVAL; + } + + pid = args; + psize = strchr(args, ','); + if (!psize) { + pr_err("Cache size is not specified\n"); + return -EINVAL; + } + + *psize = 0; + psize++; + + ret = kstrtoul(pid, 0, &id); + if (ret) + return ret; + + req = xa_erase(&cache->reqs, id); + if (!req) + return -EINVAL; + + /* fail OPEN request if copen format is invalid */ + ret = kstrtol(psize, 0, &size); + if (ret) { + req->error = ret; + goto out; + } + + /* fail OPEN request if daemon reports an error */ + if (size < 0) { + if (!IS_ERR_VALUE(size)) + size = -EINVAL; + req->error = size; + goto out; + } + + cookie = req->object->cookie; + cookie->object_size = size; + if (size) + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + else + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + +out: + complete(&req->done); + return ret; +} + +static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_open *load; + struct file *file; + u32 object_id; + int ret, fd; + + object = cachefiles_grab_object(req->object, + cachefiles_obj_get_ondemand_fd); + cache = object->volume->cache; + + ret = xa_alloc_cyclic(&cache->ondemand_ids, &object_id, NULL, + XA_LIMIT(1, INT_MAX), + &cache->ondemand_id_next, GFP_KERNEL); + if (ret < 0) + goto err; + + fd = get_unused_fd_flags(O_WRONLY); + if (fd < 0) { + ret = fd; + goto err_free_id; + } + + file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops, + object, O_WRONLY); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err_put_fd; + } + + file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; + fd_install(fd, file); + + load = (void *)req->msg.data; + load->fd = fd; + req->msg.object_id = object_id; + object->ondemand_id = object_id; + return 0; + +err_put_fd: + put_unused_fd(fd); +err_free_id: + xa_erase(&cache->ondemand_ids, object_id); +err: + cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); + return ret; +} + +ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +{ + struct cachefiles_req *req; + struct cachefiles_msg *msg; + unsigned long id = 0; + size_t n; + int ret = 0; + XA_STATE(xas, &cache->reqs, 0); + + /* + * Search for a request that has not ever been processed, to prevent + * requests from being processed repeatedly. 
+ */ + xa_lock(&cache->reqs); + req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW); + if (!req) { + xa_unlock(&cache->reqs); + return 0; + } + + msg = &req->msg; + n = msg->len; + + if (n > buflen) { + xa_unlock(&cache->reqs); + return -EMSGSIZE; + } + + xas_clear_mark(&xas, CACHEFILES_REQ_NEW); + xa_unlock(&cache->reqs); + + id = xas.xa_index; + msg->msg_id = id; + + if (msg->opcode == CACHEFILES_OP_OPEN) { + ret = cachefiles_ondemand_get_fd(req); + if (ret) + goto error; + } + + if (copy_to_user(_buffer, msg, n) != 0) { + ret = -EFAULT; + goto err_put_fd; + } + + return n; + +err_put_fd: + if (msg->opcode == CACHEFILES_OP_OPEN) + close_fd(((struct cachefiles_open *)msg->data)->fd); +error: + xa_erase(&cache->reqs, id); + req->error = ret; + complete(&req->done); + return ret; +} + +typedef int (*init_req_fn)(struct cachefiles_req *req, void *private); + +static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + enum cachefiles_opcode opcode, + size_t data_len, + init_req_fn init_req, + void *private) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct cachefiles_req *req; + XA_STATE(xas, &cache->reqs, 0); + int ret; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return 0; + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->object = object; + init_completion(&req->done); + req->msg.opcode = opcode; + req->msg.len = sizeof(struct cachefiles_msg) + data_len; + + ret = init_req(req, private); + if (ret) + goto out; + + do { + /* + * Stop enqueuing the request when daemon is dying. The + * following two operations need to be atomic as a whole. + * 1) check cache state, and + * 2) enqueue request if cache is alive. + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. + * + * CPU 1 CPU 2 + * ===== ===== + * test CACHEFILES_DEAD bit + * set CACHEFILES_DEAD bit + * flush requests in the xarray + * enqueue the request + */ + xas_lock(&xas); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + xas_unlock(&xas); + ret = -EIO; + goto out; + } + + /* coupled with the barrier in cachefiles_flush_reqs() */ + smp_mb(); + + xas.xa_index = 0; + xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); + xas_store(&xas, req); + xas_clear_mark(&xas, XA_FREE_MARK); + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + ret = xas_error(&xas); + if (ret) + goto out; + + wake_up_all(&cache->daemon_pollwq); + wait_for_completion(&req->done); + ret = req->error; +out: + kfree(req); + return ret; +} + +static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct fscache_cookie *cookie = object->cookie; + struct fscache_volume *volume = object->volume->vcookie; + struct cachefiles_open *load = (void *)req->msg.data; + size_t volume_key_size, cookie_key_size; + void *volume_key, *cookie_key; + + /* + * Volume key is a NUL-terminated string. key[0] stores strlen() of the + * string, followed by the content of the string (excluding '\0'). + */ + volume_key_size = volume->key[0] + 1; + volume_key = volume->key + 1; + + /* Cookie key is binary data, which is netfs specific. 
*/ + cookie_key_size = cookie->key_len; + cookie_key = fscache_get_key(cookie); + + if (!(object->cookie->advice & FSCACHE_ADV_WANT_CACHE_SIZE)) { + pr_err("WANT_CACHE_SIZE is needed for on-demand mode\n"); + return -EINVAL; + } + + load->volume_key_size = volume_key_size; + load->cookie_key_size = cookie_key_size; + memcpy(load->data, volume_key, volume_key_size); + memcpy(load->data + volume_key_size, cookie_key, cookie_key_size); + + return 0; +} + +int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_volume *volume = object->volume->vcookie; + size_t volume_key_size, cookie_key_size, data_len; + + /* + * CacheFiles will firstly check the cache file under the root cache + * directory. If the coherency check failed, it will fallback to + * creating a new tmpfile as the cache file. Reuse the previously + * allocated object ID if any. + */ + if (object->ondemand_id > 0) + return 0; + + volume_key_size = volume->key[0] + 1; + cookie_key_size = cookie->key_len; + data_len = sizeof(struct cachefiles_open) + + volume_key_size + cookie_key_size; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN, + data_len, cachefiles_ondemand_init_open_req, NULL); +} diff --git a/include/linux/fscache.h b/include/linux/fscache.h index e25539072463..72585c9729a2 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -39,6 +39,7 @@ struct fscache_cookie; #define FSCACHE_ADV_SINGLE_CHUNK 0x01 /* The object is a single chunk of data */ #define FSCACHE_ADV_WRITE_CACHE 0x00 /* Do cache if written to locally */ #define FSCACHE_ADV_WRITE_NOCACHE 0x02 /* Don't cache if written to locally */ +#define FSCACHE_ADV_WANT_CACHE_SIZE 0x04 /* Retrieve cache size at runtime */ #define FSCACHE_INVAL_DIO_WRITE 0x01 /* Invalidate due to DIO write */ diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 311c14a20e70..93df9391bd7f 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -31,6 +31,8 @@ enum cachefiles_obj_ref_trace { cachefiles_obj_see_lookup_failed, cachefiles_obj_see_withdraw_cookie, cachefiles_obj_see_withdrawal, + cachefiles_obj_get_ondemand_fd, + cachefiles_obj_put_ondemand_fd, }; enum fscache_why_object_killed { diff --git a/include/uapi/linux/cachefiles.h b/include/uapi/linux/cachefiles.h new file mode 100644 index 000000000000..521f2fe4fe9c --- /dev/null +++ b/include/uapi/linux/cachefiles.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_CACHEFILES_H +#define _LINUX_CACHEFILES_H + +#include + +/* + * Fscache ensures that the maximum length of cookie key is 255. The volume key + * is controlled by netfs, and generally no bigger than 255. + */ +#define CACHEFILES_MSG_MAX_SIZE 1024 + +enum cachefiles_opcode { + CACHEFILES_OP_OPEN, +}; + +/* + * Message Header + * + * @msg_id a unique ID identifying this message + * @opcode message type, CACHEFILE_OP_* + * @len message length, including message header and following data + * @object_id a unique ID identifying a cache file + * @data message type specific payload + */ +struct cachefiles_msg { + __u32 msg_id; + __u32 opcode; + __u32 len; + __u32 object_id; + __u8 data[]; +}; + +/* + * @data contains the volume_key followed directly by the cookie_key. volume_key + * is a NUL-terminated string; @volume_key_size indicates the size of the volume + * key in bytes. 
cookie_key is binary data, which is netfs specific; + * @cookie_key_size indicates the size of the cookie key in bytes. + * + * @fd identifies an anon_fd referring to the cache file. + */ +struct cachefiles_open { + __u32 volume_key_size; + __u32 cookie_key_size; + __u32 fd; + __u32 flags; + __u8 data[]; +}; + +#endif -- cgit v1.2.3 From 324b954ac80cff0d11ddb6bde9b6631e45e98620 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 25 Apr 2022 20:21:26 +0800 Subject: cachefiles: notify the user daemon when withdrawing cookie Notify the user daemon that the cookie is going to be withdrawn, providing a hint that the associated anonymous fd can be closed. Note that this is only a hint. The user daemon may close the associated anonymous fd when receiving the CLOSE request, then it will receive another anonymous fd when the cookie gets looked up. Or it may ignore the CLOSE request, and keep writing data through the anonymous fd. However, the next time the cookie gets looked up, the user daemon will still receive another new anonymous fd. Signed-off-by: Jeffle Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20220425122143.56815-5-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/cachefiles/interface.c | 2 ++ fs/cachefiles/internal.h | 5 +++++ fs/cachefiles/ondemand.c | 38 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/cachefiles.h | 1 + 4 files changed, 46 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index ae93cee9d25d..a69073a1d3f0 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -362,6 +362,8 @@ static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie) spin_unlock(&cache->object_list_lock); } + cachefiles_ondemand_clean_object(object); + if (object->file) { cachefiles_begin_secure(cache, &saved_cred); cachefiles_clean_up_object(object, cache); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index e5c612888f84..da388ba127eb 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -290,6 +290,7 @@ extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args); extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); +extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); #else static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, @@ -302,6 +303,10 @@ static inline int cachefiles_ondemand_init_object(struct cachefiles_object *obje { return 0; } + +static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ +} #endif /* diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 7946ee6c40be..11b1c15ac697 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -229,6 +229,12 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, goto err_put_fd; } + /* CLOSE request has no reply */ + if (msg->opcode == CACHEFILES_OP_CLOSE) { + xa_erase(&cache->reqs, id); + complete(&req->done); + } + return n; err_put_fd: @@ -300,6 +306,13 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, /* coupled with the barrier in cachefiles_flush_reqs() */ smp_mb(); + if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) { + WARN_ON_ONCE(object->ondemand_id == 0); + xas_unlock(&xas); + ret = -EIO; + goto out; + } + xas.xa_index = 0; xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); if (xas.xa_node == XAS_RESTART) @@ -356,6 +369,25 @@ static int
cachefiles_ondemand_init_open_req(struct cachefiles_req *req, return 0; } +static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + int object_id = object->ondemand_id; + + /* + * It's possible that object id is still 0 if the cookie looking up + * phase failed before OPEN request has ever been sent. Also avoid + * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means + * anon_fd has already been closed. + */ + if (object_id <= 0) + return -ENOENT; + + req->msg.object_id = object_id; + return 0; +} + int cachefiles_ondemand_init_object(struct cachefiles_object *object) { struct fscache_cookie *cookie = object->cookie; @@ -379,3 +411,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN, data_len, cachefiles_ondemand_init_open_req, NULL); } + +void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, + cachefiles_ondemand_init_close_req, NULL); +} diff --git a/include/uapi/linux/cachefiles.h b/include/uapi/linux/cachefiles.h index 521f2fe4fe9c..37a0071037c8 100644 --- a/include/uapi/linux/cachefiles.h +++ b/include/uapi/linux/cachefiles.h @@ -12,6 +12,7 @@ enum cachefiles_opcode { CACHEFILES_OP_OPEN, + CACHEFILES_OP_CLOSE, }; /* -- cgit v1.2.3 From 9032b6e8589f269743984aac53e82e4835be16dc Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 25 Apr 2022 20:21:27 +0800 Subject: cachefiles: implement on-demand read Implement the data plane of on-demand read mode. The early implementation [1] placed the entry to cachefiles_ondemand_read() in fscache_read(). However, fscache_read() can only detect whether the requested file range is a full cache miss, whilst we need to notify the user daemon as long as there's a hole inside the requested file range. Thus the entry is now placed in cachefiles_prepare_read(). When working in on-demand read mode, once a hole is detected, the read routine will send a READ request to the user daemon. The user daemon needs to fetch the data and write it to the cache file. After sending the READ request, the read routine will hang there until the READ request is handled by the user daemon. Then it will retry reading from the same file range. If no progress is made, the read routine fails. A new NETFS_SREQ_ONDEMAND flag is introduced to indicate that on-demand read should be done when a cache miss is encountered.
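To make the request/reply flow concrete, a minimal sketch of the daemon side might look like the following (illustrative only: remember_anon_fd(), lookup_anon_fd() and fetch_range() are hypothetical daemon-local helpers, and the "copen" reply to OPEN requests, handled by cachefiles_ondemand_copen(), is omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cachefiles.h>

/* Hypothetical daemon-local bookkeeping, not part of these patches. */
extern void remember_anon_fd(uint32_t object_id, int fd);
extern int lookup_anon_fd(uint32_t object_id);
/* Hypothetical: fetch [off, off + len) from the source, pwrite() it to fd. */
extern void fetch_range(int fd, uint64_t off, uint64_t len);

static int handle_one_request(int devfd)
{
	char buf[CACHEFILES_MSG_MAX_SIZE];
	struct cachefiles_msg *msg = (struct cachefiles_msg *)buf;
	/* Serviced by cachefiles_ondemand_daemon_read() in the kernel. */
	ssize_t n = read(devfd, buf, sizeof(buf));

	if (n <= 0)
		return -1;

	switch (msg->opcode) {
	case CACHEFILES_OP_OPEN: {
		struct cachefiles_open *load = (struct cachefiles_open *)msg->data;

		/* Keep the anon fd; the object size is replied via "copen". */
		remember_anon_fd(msg->object_id, load->fd);
		break;
	}
	case CACHEFILES_OP_CLOSE:
		/* Only a hint: the anon fd for this object may now be closed. */
		close(lookup_anon_fd(msg->object_id));
		break;
	case CACHEFILES_OP_READ: {
		struct cachefiles_read *read_req = (struct cachefiles_read *)msg->data;
		int anon_fd = lookup_anon_fd(msg->object_id);

		fetch_range(anon_fd, read_req->off, read_req->len);
		/* Wake the kernel side waiting in cachefiles_ondemand_read(). */
		ioctl(anon_fd, CACHEFILES_IOC_READ_COMPLETE, msg->msg_id);
		break;
	}
	}
	return 0;
}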
[1] https://lore.kernel.org/all/20220406075612.60298-6-jefflexu@linux.alibaba.com/ #v8 Signed-off-by: Jeffle Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20220425122143.56815-6-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/cachefiles/internal.h | 9 +++++ fs/cachefiles/io.c | 15 ++++++-- fs/cachefiles/ondemand.c | 77 +++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 1 + include/uapi/linux/cachefiles.h | 17 +++++++++ 5 files changed, 117 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index da388ba127eb..6cba2c6de2f9 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -292,6 +292,9 @@ extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); +extern int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len); + #else static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) @@ -307,6 +310,12 @@ static inline int cachefiles_ondemand_init_object(struct cachefiles_object *obje static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object) { } + +static inline int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + return -EOPNOTSUPP; +} #endif /* diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 50a14e8f0aac..000a28f46e59 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -403,6 +403,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; + int rc; _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); @@ -415,7 +416,8 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); why = cachefiles_trace_read_no_data; - goto out_no_object; + if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + goto out_no_object; } /* The object and the file may be being created in the background. 
*/ @@ -432,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * object = cachefiles_cres_object(cres); cache = object->volume->cache; cachefiles_begin_secure(cache, &saved_cred); - +retry: off = cachefiles_inject_read_error(); if (off == 0) off = vfs_llseek(file, subreq->start, SEEK_DATA); @@ -483,6 +485,15 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * download_and_store: __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { + rc = cachefiles_ondemand_read(object, subreq->start, + subreq->len); + if (!rc) { + __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + goto retry; + } + ret = NETFS_INVALID_READ; + } out: cachefiles_end_secure(cache, saved_cred); out_no_object: diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 11b1c15ac697..3470d4e8f0cb 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -10,8 +10,25 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, struct cachefiles_object *object = file->private_data; struct cachefiles_cache *cache = object->volume->cache; int object_id = object->ondemand_id; + struct cachefiles_req *req; + XA_STATE(xas, &cache->reqs, 0); + xa_lock(&cache->reqs); object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + + /* + * Flush all pending READ requests since their completion depends on + * anon_fd. + */ + xas_for_each(&xas, req, ULONG_MAX) { + if (req->msg.opcode == CACHEFILES_OP_READ) { + req->error = -EIO; + complete(&req->done); + xas_store(&xas, NULL); + } + } + xa_unlock(&cache->reqs); + xa_erase(&cache->ondemand_ids, object_id); cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); cachefiles_put_unbind_pincount(cache); @@ -57,11 +74,35 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, return vfs_llseek(file, pos, whence); } +static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct cachefiles_object *object = filp->private_data; + struct cachefiles_cache *cache = object->volume->cache; + struct cachefiles_req *req; + unsigned long id; + + if (ioctl != CACHEFILES_IOC_READ_COMPLETE) + return -EINVAL; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + id = arg; + req = xa_erase(&cache->reqs, id); + if (!req) + return -EINVAL; + + complete(&req->done); + return 0; +} + static const struct file_operations cachefiles_ondemand_fd_fops = { .owner = THIS_MODULE, .release = cachefiles_ondemand_fd_release, .write_iter = cachefiles_ondemand_fd_write_iter, .llseek = cachefiles_ondemand_fd_llseek, + .unlocked_ioctl = cachefiles_ondemand_fd_ioctl, }; /* @@ -388,6 +429,32 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, return 0; } +struct cachefiles_read_ctx { + loff_t off; + size_t len; +}; + +static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct cachefiles_read *load = (void *)req->msg.data; + struct cachefiles_read_ctx *read_ctx = private; + int object_id = object->ondemand_id; + + /* Stop enqueuing requests when daemon has closed anon_fd. 
*/ + if (object_id <= 0) { + WARN_ON_ONCE(object_id == 0); + pr_info_once("READ: anonymous fd closed prematurely.\n"); + return -EIO; + } + + req->msg.object_id = object_id; + load->off = read_ctx->off; + load->len = read_ctx->len; + return 0; +} + int cachefiles_ondemand_init_object(struct cachefiles_object *object) { struct fscache_cookie *cookie = object->cookie; @@ -417,3 +484,13 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object) cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, cachefiles_ondemand_init_close_req, NULL); } + +int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + struct cachefiles_read_ctx read_ctx = {pos, len}; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ, + sizeof(struct cachefiles_read), + cachefiles_ondemand_init_read_req, &read_ctx); +} diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c7bf1eaf51d5..057d04efaf79 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -159,6 +159,7 @@ struct netfs_io_subrequest { #define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ }; enum netfs_io_origin { diff --git a/include/uapi/linux/cachefiles.h b/include/uapi/linux/cachefiles.h index 37a0071037c8..78caa73e5343 100644 --- a/include/uapi/linux/cachefiles.h +++ b/include/uapi/linux/cachefiles.h @@ -3,6 +3,7 @@ #define _LINUX_CACHEFILES_H #include +#include /* * Fscache ensures that the maximum length of cookie key is 255. The volume key @@ -13,6 +14,7 @@ enum cachefiles_opcode { CACHEFILES_OP_OPEN, CACHEFILES_OP_CLOSE, + CACHEFILES_OP_READ, }; /* @@ -48,4 +50,19 @@ struct cachefiles_open { __u8 data[]; }; +/* + * @off indicates the starting offset of the requested file range + * @len indicates the length of the requested file range + */ +struct cachefiles_read { + __u64 off; + __u64 len; +}; + +/* + * Reply for READ request + * @arg for this ioctl is the @id field of READ request. + */ +#define CACHEFILES_IOC_READ_COMPLETE _IOW(0x98, 1, int) + #endif -- cgit v1.2.3 From c7fb19428d67dd0a2a78a4f237af01d39c78dc5a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Apr 2022 14:38:53 -0600 Subject: io_uring: add support for ring mapped supplied buffers Provided buffers allow an application to supply io_uring with buffers that can then be grabbed for a read/receive request when the data source is ready to deliver data. The existing scheme relies on using IORING_OP_PROVIDE_BUFFERS to do that, but it can be difficult to use in real world applications. It's pretty efficient if the application is able to supply back batches of provided buffers when they have been consumed and the application is ready to recycle them, but if fragmentation occurs in the buffer space, it can become difficult to supply enough buffers at a time. This hurts efficiency. Add a register op, IORING_REGISTER_PBUF_RING, which allows an application to set up a shared queue for each buffer group of provided buffers. The application can then supply buffers simply by adding them to this ring, and the kernel can consume them just as easily. The ring tail is shared with (and advanced by) the application, while the consumer head remains private to the kernel.
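For illustration, a minimal userspace setup of one such ring might look like the sketch below (assuming io_uring_register() is a thin wrapper around the raw register syscall and the io_uring itself already exists; error handling is elided):

#include <string.h>
#include <sys/mman.h>
#include <linux/io_uring.h>

/* Assumed thin wrapper around syscall(__NR_io_uring_register, ...). */
extern int io_uring_register(int fd, unsigned int opcode, void *arg,
			     unsigned int nr_args);

#define RING_ENTRIES	128	/* must be a power of 2 */

static struct io_uring_buf_ring *setup_pbuf_ring(int ring_fd, void *bufs,
						 unsigned int buf_len)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
	unsigned int i;

	/* Page aligned, as IORING_REGISTER_PBUF_RING requires. */
	br = mmap(NULL, RING_ENTRIES * sizeof(struct io_uring_buf),
		  PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long)br;
	reg.ring_entries = RING_ENTRIES;
	reg.bgid = 0;
	if (io_uring_register(ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1))
		return NULL;

	/* Fill every slot, then publish the new tail in one store; the
	 * release pairs with the kernel's smp_load_acquire() of the tail. */
	for (i = 0; i < RING_ENTRIES; i++) {
		br->bufs[i].addr = (unsigned long)bufs + i * buf_len;
		br->bufs[i].len = buf_len;
		br->bufs[i].bid = i;
	}
	__atomic_store_n(&br->tail, RING_ENTRIES, __ATOMIC_RELEASE);
	return br;
}

Replenishing later works the same way: fill the next slots (indices wrap via the mask) and advance the shared tail.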
Provided buffers set up with IORING_REGISTER_PBUF_RING cannot use IORING_OP_{PROVIDE,REMOVE}_BUFFERS for adding or removing entries; they must use the mapped ring. Mapped provided buffer rings can co-exist with normal provided buffers, just not within the same group ID. To gauge overhead of the existing scheme and evaluate the mapped ring approach, a simple NOP benchmark was written. It uses a ring of 128 entries, and submits/completes 32 at a time. 'Replenish' is how many buffers are provided back at a time after they have been consumed:

Test                    Replenish    NOPs/sec
================================================================
No provided buffers     NA           ~30M
Provided buffers        32           ~16M
Provided buffers        1            ~10M
Ring buffers            32           ~27M
Ring buffers            1            ~27M

The ring mapped buffers perform almost as well as not using provided buffers at all, and they don't care whether you provide 1 or more back at a time. This means applications can just replenish as they go, rather than needing to batch and compact, further reducing overhead in the application. The NOP benchmark above doesn't need to do any compaction, so that overhead isn't even reflected in the above test. Co-developed-by: Dylan Yudaken Signed-off-by: Jens Axboe --- fs/io_uring.c | 234 +++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 36 +++++++ 2 files changed, 258 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 384cdbd40941..78192a9e7684 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -285,9 +285,26 @@ struct io_rsrc_data { bool quiesce; }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) struct io_buffer_list { - struct list_head buf_list; + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. + */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u32 head; + __u32 mask; }; struct io_buffer { @@ -804,6 +821,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, @@ -855,6 +873,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -979,6 +999,12 @@ struct io_kiocb { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set.
+ */ + struct io_buffer_list *buf_list; }; union { @@ -1470,8 +1496,14 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&req->kbuf->list, list); + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } @@ -1480,7 +1512,7 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } @@ -1490,7 +1522,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, { unsigned int cflags; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; /* @@ -1505,7 +1537,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, * We migrate buffers from the comp_list to the issue cache list * when we need one. */ - if (issue_flags & IO_URING_F_UNLOCKED) { + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); @@ -1535,11 +1570,23 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; /* don't recycle if we already did IO to this buffer */ if (req->flags & REQ_F_PARTIAL_IO) return; + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + return; + } io_ring_submit_lock(ctx, issue_flags); @@ -3487,6 +3534,53 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return ret; } +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u32 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) { + io_ring_submit_unlock(req->ctx, issue_flags); + return ERR_PTR(-ENOBUFS); + } + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; + } else { + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + buf = page_address(bl->buf_pages[index]); + buf += off; + } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; + + if (!(issue_flags & IO_URING_F_UNLOCKED)) + return u64_to_user_ptr(buf->addr); + + /* + * If we came in unlocked, we have no choice but to + * consume the buffer here. This does mean it'll be + * pinned until the IO completes. 
But coming in + * unlocked means we're in io-wq context, hence there + * should be no further retry. For the locked case, the + * caller must ensure to call the commit when the + * transfer completes (or if we get -EAGAIN and must + * poll or retry). + */ + req->buf_list = NULL; + bl->head++; + io_ring_submit_unlock(req->ctx, issue_flags); + return u64_to_user_ptr(buf->addr); +} + static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags) { @@ -3502,6 +3596,9 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, } /* selection helpers drop the submit lock again, if needed */ + if (bl->buf_nr_pages) + return io_ring_buffer_select(req, len, bl, issue_flags); + return io_provided_buffer_select(req, len, bl, issue_flags); } @@ -3558,7 +3655,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { iov[0].iov_base = u64_to_user_ptr(req->rw.addr); iov[0].iov_len = req->rw.len; return 0; @@ -3578,7 +3675,7 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return false; - return !(req->flags & REQ_F_BUFFER_SELECTED); + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, @@ -4872,6 +4969,18 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + return i; + } + /* the head kbuf is the list itself */ while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; @@ -4898,8 +5007,12 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); - if (bl) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } if (ret < 0) req_set_fail(req); @@ -5047,7 +5160,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { - bl = kmalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) { ret = -ENOMEM; goto err; @@ -5058,6 +5171,11 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) goto err; } } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; + } ret = io_add_buffers(ctx, p, bl); err: @@ -12011,6 +12129,83 @@ err: return ret; } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret 
= io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl && bl->buf_nr_pages) + return -EEXIST; + if (!bl) { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -12142,6 +12337,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; default: ret = -EINVAL; break; @@ -12227,6 +12434,9 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); + BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); + BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != + offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 15f821af9242..ddf969ae5a79 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -384,6 +384,10 @@ enum { IORING_REGISTER_RING_FDS = 20, IORING_UNREGISTER_RING_FDS = 21, + /* register ring based provide buffer group */ + IORING_REGISTER_PBUF_RING = 22, + IORING_UNREGISTER_PBUF_RING = 23, + /* this goes last */ IORING_REGISTER_LAST }; @@ -461,6 +465,38 @@ struct io_uring_restriction { __u32 resv2[3]; }; +struct io_uring_buf { + __u64 addr; + __u32 len; + __u16 bid; + __u16 resv; +}; + +struct io_uring_buf_ring { + union { + /* + * To avoid spilling into more pages than we need to, the + * ring tail is overlaid with the io_uring_buf->resv field. 
+ */ + struct { + __u64 resv1; + __u32 resv2; + __u16 resv3; + __u16 tail; + }; + struct io_uring_buf bufs[0]; + }; +}; + +/* argument for IORING_(UN)REGISTER_PBUF_RING */ +struct io_uring_buf_reg { + __u64 ring_addr; + __u32 ring_entries; + __u16 bgid; + __u16 pad; + __u64 resv[3]; +}; + /* * io_uring_restriction->opcode values */ -- cgit v1.2.3 From c1318b39c7d36bd5139a9c71044ff2b2d3c6f9d8 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 18 May 2022 12:27:31 +0300 Subject: tls: Add opt-in zerocopy mode of sendfile() TLS device offload copies sendfile data to a bounce buffer before transmitting. This makes it possible to maintain a valid MAC on TLS records when the file contents change and part of a TLS record has to be retransmitted at the TCP level. In many common use cases (like serving static files over HTTPS) the file contents are not changed on the fly. In many use cases breaking the connection is totally acceptable if the file is changed during transmission, because it would be received corrupted in any case. This commit makes it possible to optimize performance for such use cases by providing a new optional mode of TLS sendfile(), in which the extra copy is skipped. Removing this copy improves performance significantly, as TLS and TCP sendfile perform the same operations, and the only overhead is TLS header/trailer insertion. The new mode can only be enabled with the new socket option named TLS_TX_ZEROCOPY_SENDFILE on a per-socket basis. It preserves backwards compatibility with existing applications that rely on the copying behavior. The new mode is safe, meaning that unsolicited modifications of the file being sent can't break the integrity of the kernel. The worst thing that can happen is sending a corrupted TLS record, which is in any case not forbidden when using regular TCP sockets. Sockets other than TLS device offload are not affected by the new socket option. The actual status of zerocopy sendfile can be queried with sock_diag.
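For illustration, enabling the new mode on an already established ktls socket could look like this sketch (the TLS_TX parameters are assumed to be configured beforehand, SOL_TLS is taken from include/linux/socket.h, and error handling is trimmed):

#include <fcntl.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/tls.h>

#ifndef SOL_TLS
#define SOL_TLS 282	/* from include/linux/socket.h */
#endif

static ssize_t send_static_file(int tls_sk, const char *path, size_t size)
{
	unsigned int enable = 1;
	ssize_t ret;
	int fd;

	/* Opt in; only the TLS device offload TX path acts on it. */
	if (setsockopt(tls_sk, SOL_TLS, TLS_TX_ZEROCOPY_SENDFILE,
		       &enable, sizeof(enable)))
		return -1;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	/* The file must stay unchanged while in flight; otherwise the peer
	 * may receive corrupted records (the connection breaks, but kernel
	 * integrity is preserved). */
	ret = sendfile(tls_sk, fd, NULL, size);
	close(fd);
	return ret;
}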
Performance numbers in a single-core test with 24 HTTPS streams on nginx, under 100% CPU load: * non-zerocopy: 33.6 Gbit/s * zerocopy: 79.92 Gbit/s CPU: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz Signed-off-by: Boris Pismenny Signed-off-by: Tariq Toukan Signed-off-by: Maxim Mikityanskiy Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220518092731.1243494-1-maximmi@nvidia.com Signed-off-by: Paolo Abeni --- include/net/tls.h | 1 + include/uapi/linux/tls.h | 2 ++ net/tls/tls_device.c | 53 ++++++++++++++++++++++++++++++++++------------ net/tls/tls_main.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tls.h b/include/net/tls.h index b59f0a63292b..8017f1703447 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -238,6 +238,7 @@ struct tls_context { u8 tx_conf:3; u8 rx_conf:3; + u8 zerocopy_sendfile:1; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 5f38be0ec0f3..ac39328eabe7 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -39,6 +39,7 @@ /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ #define TLS_RX 2 /* Set receive parameters */ +#define TLS_TX_ZEROCOPY_SENDFILE 3 /* transmit zerocopy sendfile */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -160,6 +161,7 @@ enum { TLS_INFO_CIPHER, TLS_INFO_TXCONF, TLS_INFO_RXCONF, + TLS_INFO_ZC_SENDFILE, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bca00521ebc1..ec6f4b699a2b 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) return 0; } +union tls_iter_offset { + struct iov_iter *msg_iter; + int offset; +}; + static int tls_push_data(struct sock *sk, - struct iov_iter *msg_iter, + union tls_iter_offset iter_offset, size_t size, int flags, - unsigned char record_type) + unsigned char record_type, + struct page *zc_page) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -480,12 +486,21 @@ handle_error: } record = ctx->open_record; - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); - copy = min_t(size_t, copy, (max_open_record_len - record->len)); - if (copy) { + copy = min_t(size_t, size, max_open_record_len - record->len); + if (copy && zc_page) { + struct page_frag zc_pfrag; + + zc_pfrag.page = zc_page; + zc_pfrag.offset = iter_offset.offset; + zc_pfrag.size = copy; + tls_append_frag(record, &zc_pfrag, copy); + } else if (copy) { + copy = min_t(size_t, copy, pfrag->size - pfrag->offset); + rc = tls_device_copy_data(page_address(pfrag->page) + - pfrag->offset, copy, msg_iter); + pfrag->offset, copy, + iter_offset.msg_iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); @@ -540,6 +555,7 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); + union tls_iter_offset iter; int rc; mutex_lock(&tls_ctx->tx_lock); @@ -551,8 +567,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - rc = tls_push_data(sk, &msg->msg_iter, size, - msg->msg_flags, record_type); + iter.msg_iter = &msg->msg_iter; + rc = tls_push_data(sk, iter, 
size, msg->msg_flags, record_type, NULL); out: release_sock(sk); @@ -564,7 +580,8 @@ int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct iov_iter msg_iter; + union tls_iter_offset iter_offset; + struct iov_iter msg_iter; char *kaddr; struct kvec iov; int rc; @@ -580,12 +597,20 @@ int tls_device_sendpage(struct sock *sk, struct page *page, goto out; } + if (tls_ctx->zerocopy_sendfile) { + iter_offset.offset = offset; + rc = tls_push_data(sk, iter_offset, size, + flags, TLS_RECORD_TYPE_DATA, page); + goto out; + } + kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); - rc = tls_push_data(sk, &msg_iter, size, - flags, TLS_RECORD_TYPE_DATA); + iter_offset.msg_iter = &msg_iter; + rc = tls_push_data(sk, iter_offset, size, flags, TLS_RECORD_TYPE_DATA, + NULL); kunmap(page); out: @@ -656,10 +681,12 @@ EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { - struct iov_iter msg_iter; + union tls_iter_offset iter; + struct iov_iter msg_iter; iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); - return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); + iter.msg_iter = &msg_iter; + return tls_push_data(sk, iter, 0, flags, TLS_RECORD_TYPE_DATA, NULL); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7b2b0e7ffee4..b91ddc110786 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -513,6 +513,26 @@ out: return rc; } +static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + unsigned int value; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len != sizeof(value)) + return -EINVAL; + + value = ctx->zerocopy_sendfile; + if (copy_to_user(optval, &value, sizeof(value))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -524,6 +544,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, rc = do_tls_getsockopt_conf(sk, optval, optlen, optname == TLS_TX); break; + case TLS_TX_ZEROCOPY_SENDFILE: + rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -675,6 +698,26 @@ err_crypto_info: return rc; } +static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + unsigned int value; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value > 1) + return -EINVAL; + + ctx->zerocopy_sendfile = value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -688,6 +731,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, optname == TLS_TX); release_sock(sk); break; + case TLS_TX_ZEROCOPY_SENDFILE: + lock_sock(sk); + rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); + release_sock(sk); + break; default: rc = -ENOPROTOOPT; break; @@ -921,6 +969,12 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb) if (err) goto nla_failure; + if (ctx->tx_conf == TLS_HW && ctx->zerocopy_sendfile) { + err = nla_put_flag(skb, TLS_INFO_ZC_SENDFILE); + if (err) + goto nla_failure; + } + 
rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -940,6 +994,7 @@ static size_t tls_get_info_size(const struct sock *sk) nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */ nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ + nla_total_size(0) + /* TLS_INFO_ZC_SENDFILE */ 0; return size; -- cgit v1.2.3 From 58e5bdeb9c2b06895e723c0b1e670f54510ff782 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Fri, 20 May 2022 14:36:30 +0530 Subject: nvme: enable uring-passthrough for admin commands Add two new opcodes that userspace can use for admin commands: NVME_URING_CMD_ADMIN : non-vectored NVME_URING_CMD_ADMIN_VEC : vectored variant Wire up support for when these are issued on the controller node (/dev/nvmeX). Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220520090630.70394-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/ioctl.c | 23 +++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/uapi/linux/nvme_ioctl.h | 2 ++ 4 files changed, 27 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 682df98db341..1a984045e49c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3146,6 +3146,7 @@ static const struct file_operations nvme_dev_fops = { .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_dev_uring_cmd, }; static ssize_t nvme_sysfs_reset(struct device *dev, diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 114b490592b0..096b1b47d750 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -686,6 +686,29 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, } #endif /* CONFIG_NVME_MULTIPATH */ +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ioucmd->file->private_data; + int ret; + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_ADMIN: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_ADMIN_VEC: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 086ccbdd7003..26d35c557588 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -787,6 +787,7 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 0b1876aa5a59..2f76cba67166 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -108,5 +108,7 @@ struct nvme_uring_cmd { /* io_uring async commands: */ #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) #define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) +#define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) +#define
NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- cgit v1.2.3 From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 May 2022 16:30:10 -0700 Subject: bpf: Add bpf_skc_to_mptcp_sock_proto This patch implements a new struct bpf_func_proto, named bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP, and a new helper bpf_skc_to_mptcp_sock(), which invokes another new helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct mptcp_sock from a given subflow socket. v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c, remove build check for CONFIG_BPF_JIT v5: Drop EXPORT_SYMBOL (Martin) Co-developed-by: Nicolas Rybowski Co-developed-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com --- include/linux/bpf.h | 1 + include/linux/btf_ids.h | 3 ++- include/net/mptcp.h | 6 ++++++ include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/verifier.c | 1 + kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 18 ++++++++++++++++++ net/mptcp/Makefile | 2 ++ net/mptcp/bpf.c | 21 +++++++++++++++++++++ scripts/bpf_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 7 +++++++ 11 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 net/mptcp/bpf.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c107392b0ba7..a3ef078401cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2231,6 +2231,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bc5d9cc34e4c..335a19092368 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -178,7 +178,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) enum { #define BTF_SOCK_TYPE(name, str) name, diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8b1afd6f5cc4..2ba09de955c7 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -284,4 +284,10 @@ static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif +#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); +#else +static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } +#endif + #endif /* __NET_MPTCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0210f85131b3..56688bee20d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5172,6 +5172,12 @@ union bpf_attr { * Return * Map value 
associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5370,6 +5376,7 @@ union bpf_attr { FN(ima_file_hash), \ FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ + FN(skc_to_mptcp_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b59581026f8..14e8c17d3d8d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -509,6 +509,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7141ca8a1c2d..10b157a6d73e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1705,6 +1705,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; + case BPF_FUNC_skc_to_mptcp_sock: + return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: diff --git a/net/core/filter.c b/net/core/filter.c index fe0da529d00f..5af58eb48587 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -78,6 +78,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -11281,6 +11282,20 @@ const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); @@ -11323,6 +11338,9 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e54daceac58b..99dddf08ca73 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -10,3 +10,5 @@ obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o + +obj-$(CONFIG_BPF_SYSCALL) += bpf.o diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c new file mode 100644 index 000000000000..5a0a84ad94af --- /dev/null +++ b/net/mptcp/bpf.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Tessares SA. + * Copyright (c) 2022, SUSE. 
+ * + * Author: Nicolas Rybowski + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include +#include "protocol.h" + +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + return mptcp_sk(mptcp_subflow_ctx(sk)->conn); + + return NULL; +} diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 096625242475..d5452f7eb996 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -633,6 +633,7 @@ class PrinterHelpers(Printer): 'struct socket', 'struct file', 'struct bpf_timer', + 'struct mptcp_sock', ] known_types = { '...', @@ -682,6 +683,7 @@ class PrinterHelpers(Printer): 'struct socket', 'struct file', 'struct bpf_timer', + 'struct mptcp_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0210f85131b3..56688bee20d9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5172,6 +5172,12 @@ union bpf_attr { * Return * Map value associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5370,6 +5376,7 @@ union bpf_attr { FN(ima_file_hash), \ FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ + FN(skc_to_mptcp_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From a13e248ff90e81e9322406c0e618cf2168702f4e Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 6 May 2022 18:08:11 +0200 Subject: landlock: Fix landlock_add_rule(2) documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not mandatory to pass a file descriptor obtained with the O_PATH flag. Also, replace rule's accesses with ruleset's accesses. Link: https://lore.kernel.org/r/20220506160820.524344-2-mic@digikod.net Cc: stable@vger.kernel.org Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 5 +++-- security/landlock/syscalls.c | 7 +++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 15c31abb0d76..21c8d58283c9 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -62,8 +62,9 @@ struct landlock_path_beneath_attr { */ __u64 allowed_access; /** - * @parent_fd: File descriptor, open with ``O_PATH``, which identifies - * the parent directory of a file hierarchy, or just a file. + * @parent_fd: File descriptor, preferably opened with ``O_PATH``, + * which identifies the parent directory of a file hierarchy, or just a + * file. */ __s32 parent_fd; /* diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 2fde978bf8ca..7edc1d50e2bf 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -292,14 +292,13 @@ out_fdput: * * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; * - EINVAL: @flags is not 0, or inconsistent access in the rule (i.e. - * &landlock_path_beneath_attr.allowed_access is not a subset of the rule's - * accesses); + * &landlock_path_beneath_attr.allowed_access is not a subset of the + * ruleset handled accesses); * - ENOMSG: Empty accesses (e.g. 
&landlock_path_beneath_attr.allowed_access); - EBADF: @ruleset_fd is not a file descriptor for the current thread, or a * member of @rule_attr is not a file descriptor as expected; * - EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of - * @rule_attr is not the expected file descriptor type (e.g. file open - * without O_PATH); + * @rule_attr is not the expected file descriptor type; * - EPERM: @ruleset_fd has no write access to the underlying ruleset; * - EFAULT: @rule_attr inconsistency. */ -- cgit v1.2.3 From b91c3e4ea756b12b7d992529226edce1cfd854d7 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 6 May 2022 18:10:57 +0200 Subject: landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new LANDLOCK_ACCESS_FS_REFER access right to enable policy writers to allow sandboxed processes to link and rename files from and to a specific set of file hierarchies. This access right should be composed with LANDLOCK_ACCESS_FS_MAKE_* for the destination of a link or rename, and with LANDLOCK_ACCESS_FS_REMOVE_* for a source of a rename. This lifts a Landlock limitation that always denied changing the parent of an inode. Renaming or linking to the same directory is still always allowed, whether LANDLOCK_ACCESS_FS_REFER is used or not, because it is not considered a threat to user data. However, creating multiple links or renaming to a different parent directory may lead to privilege escalations if not handled properly. Indeed, we must be sure that the source doesn't gain more privileges by being accessible from the destination. This is handled by making sure that the destination hierarchy restricts at least as much as the source hierarchy (including the referenced file or directory itself). If that is not the case, an EXDEV error is returned, making it potentially possible for user space to copy the file hierarchy instead of moving or linking it. Instead of creating different access rights for the source and the destination, we choose to make it simple and consistent for users. Indeed, considering the previous constraint, it would be weird to require such a destination access right to also be granted for the source (to make it a superset). Moreover, RENAME_EXCHANGE would also add to the confusion because of paths being both a source and a destination. See the provided documentation for additional details. New tests are provided in a following commit. Reviewed-by: Paul Moore Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20220506161102.525323-8-mic@digikod.net --- include/uapi/linux/landlock.h | 27 +- security/landlock/fs.c | 600 +++++++++++++++++++++++---- security/landlock/limits.h | 2 +- security/landlock/syscalls.c | 2 +- tools/testing/selftests/landlock/base_test.c | 2 +- tools/testing/selftests/landlock/fs_test.c | 3 +- 6 files changed, 556 insertions(+), 80 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 21c8d58283c9..23df4e0e8ace 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -21,8 +21,14 @@ struct landlock_ruleset_attr { /** * @handled_access_fs: Bitmask of actions (cf. `Filesystem flags`_) * that is handled by this ruleset and should then be forbidden if no - * rule explicitly allow them. This is needed for backward - * compatibility reasons.
+ * rule explicitly allow them: it is a deny-by-default list that should + * contain as much Landlock access rights as possible. Indeed, all + * Landlock filesystem access rights that are not part of + * handled_access_fs are allowed. This is needed for backward + * compatibility reasons. One exception is the + * LANDLOCK_ACCESS_FS_REFER access right, which is always implicitly + * handled, but must still be explicitly handled to add new rules with + * this access right. */ __u64 handled_access_fs; }; @@ -112,6 +118,22 @@ struct landlock_path_beneath_attr { * - %LANDLOCK_ACCESS_FS_MAKE_FIFO: Create (or rename or link) a named pipe. * - %LANDLOCK_ACCESS_FS_MAKE_BLOCK: Create (or rename or link) a block device. * - %LANDLOCK_ACCESS_FS_MAKE_SYM: Create (or rename or link) a symbolic link. + * - %LANDLOCK_ACCESS_FS_REFER: Link or rename a file from or to a different + * directory (i.e. reparent a file hierarchy). This access right is + * available since the second version of the Landlock ABI. This is also the + * only access right which is always considered handled by any ruleset in + * such a way that reparenting a file hierarchy is always denied by default. + * To avoid privilege escalation, it is not enough to add a rule with this + * access right. When linking or renaming a file, the destination directory + * hierarchy must also always have the same or a superset of restrictions of + * the source hierarchy. If it is not the case, or if the domain doesn't + * handle this access right, such actions are denied by default with errno + * set to EXDEV. Linking also requires a LANDLOCK_ACCESS_FS_MAKE_* access + * right on the destination directory, and renaming also requires a + * LANDLOCK_ACCESS_FS_REMOVE_* access right on the source's (file or + * directory) parent. Otherwise, such actions are denied with errno set to + * EACCES. The EACCES errno prevails over EXDEV to let user space + * efficiently deal with an unrecoverable error. * * .. 
warning:: * @@ -137,6 +159,7 @@ struct landlock_path_beneath_attr { #define LANDLOCK_ACCESS_FS_MAKE_FIFO (1ULL << 10) #define LANDLOCK_ACCESS_FS_MAKE_BLOCK (1ULL << 11) #define LANDLOCK_ACCESS_FS_MAKE_SYM (1ULL << 12) +#define LANDLOCK_ACCESS_FS_REFER (1ULL << 13) /* clang-format on */ #endif /* _UAPI_LINUX_LANDLOCK_H */ diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 30b42cdee52e..ec5a6247cd3e 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -4,6 +4,7 @@ * * Copyright © 2016-2020 Mickaël Salaün * Copyright © 2018-2020 ANSSI + * Copyright © 2021-2022 Microsoft Corporation */ #include @@ -273,40 +274,262 @@ static inline bool is_nouser_or_private(const struct dentry *dentry) unlikely(IS_PRIVATE(d_backing_inode(dentry)))); } -static int check_access_path(const struct landlock_ruleset *const domain, - const struct path *const path, - const access_mask_t access_request) +static inline access_mask_t +get_handled_accesses(const struct landlock_ruleset *const domain) { - layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; - bool allowed = false, has_access = false; - struct path walker_path; - size_t i; + access_mask_t access_dom = 0; + unsigned long access_bit; + + for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS; + access_bit++) { + size_t layer_level; + + for (layer_level = 0; layer_level < domain->num_layers; + layer_level++) { + if (domain->fs_access_masks[layer_level] & + BIT_ULL(access_bit)) { + access_dom |= BIT_ULL(access_bit); + break; + } + } + } + return access_dom; +} + +static inline access_mask_t +init_layer_masks(const struct landlock_ruleset *const domain, + const access_mask_t access_request, + layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS]) +{ + access_mask_t handled_accesses = 0; + size_t layer_level; + memset(layer_masks, 0, sizeof(*layer_masks)); + /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; - if (WARN_ON_ONCE(!domain || !path)) - return 0; - if (is_nouser_or_private(path->dentry)) - return 0; - if (WARN_ON_ONCE(domain->num_layers < 1)) - return -EACCES; - /* Saves all layers handling a subset of requested accesses. */ - for (i = 0; i < domain->num_layers; i++) { + /* Saves all handled accesses per layer. */ + for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { const unsigned long access_req = access_request; unsigned long access_bit; for_each_set_bit(access_bit, &access_req, - ARRAY_SIZE(layer_masks)) { - if (domain->fs_access_masks[i] & BIT_ULL(access_bit)) { - layer_masks[access_bit] |= BIT_ULL(i); - has_access = true; + ARRAY_SIZE(*layer_masks)) { + if (domain->fs_access_masks[layer_level] & + BIT_ULL(access_bit)) { + (*layer_masks)[access_bit] |= + BIT_ULL(layer_level); + handled_accesses |= BIT_ULL(access_bit); } } } - /* An access request not handled by the domain is allowed. */ - if (!has_access) + return handled_accesses; +} + +/* + * Check that a destination file hierarchy has more restrictions than a source + * file hierarchy. This is only used for link and rename actions. + * + * @layer_masks_child2: Optional child masks. 
+ */ +static inline bool no_more_access( + const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS], + const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS], + const bool child1_is_directory, + const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS], + const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS], + const bool child2_is_directory) +{ + unsigned long access_bit; + + for (access_bit = 0; access_bit < ARRAY_SIZE(*layer_masks_parent2); + access_bit++) { + /* Ignores accesses that only make sense for directories. */ + const bool is_file_access = + !!(BIT_ULL(access_bit) & ACCESS_FILE); + + if (child1_is_directory || is_file_access) { + /* + * Checks if the destination restrictions are a + * superset of the source ones (i.e. inherited access + * rights without child exceptions): + * restrictions(parent2) >= restrictions(child1) + */ + if ((((*layer_masks_parent1)[access_bit] & + (*layer_masks_child1)[access_bit]) | + (*layer_masks_parent2)[access_bit]) != + (*layer_masks_parent2)[access_bit]) + return false; + } + + if (!layer_masks_child2) + continue; + if (child2_is_directory || is_file_access) { + /* + * Checks inverted restrictions for RENAME_EXCHANGE: + * restrictions(parent1) >= restrictions(child2) + */ + if ((((*layer_masks_parent2)[access_bit] & + (*layer_masks_child2)[access_bit]) | + (*layer_masks_parent1)[access_bit]) != + (*layer_masks_parent1)[access_bit]) + return false; + } + } + return true; +} + +/* + * Removes @layer_masks accesses that are not requested. + * + * Returns true if the request is allowed, false otherwise. + */ +static inline bool +scope_to_request(const access_mask_t access_request, + layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS]) +{ + const unsigned long access_req = access_request; + unsigned long access_bit; + + if (WARN_ON_ONCE(!layer_masks)) + return true; + + for_each_clear_bit(access_bit, &access_req, ARRAY_SIZE(*layer_masks)) + (*layer_masks)[access_bit] = 0; + return !memchr_inv(layer_masks, 0, sizeof(*layer_masks)); +} + +/* + * Returns true if there is at least one access right different than + * LANDLOCK_ACCESS_FS_REFER. + */ +static inline bool +is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS], + const access_mask_t access_request) +{ + unsigned long access_bit; + /* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */ + const unsigned long access_check = access_request & + ~LANDLOCK_ACCESS_FS_REFER; + + if (!layer_masks) + return false; + + for_each_set_bit(access_bit, &access_check, ARRAY_SIZE(*layer_masks)) { + if ((*layer_masks)[access_bit]) + return true; + } + return false; +} + +/** + * check_access_path_dual - Check accesses for requests with a common path + * + * @domain: Domain to check against. + * @path: File hierarchy to walk through. + * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is + * equal to @layer_masks_parent2 (if any). This is tied to the unique + * requested path for most actions, or the source in case of a refer action + * (i.e. rename or link), or the source and destination in case of + * RENAME_EXCHANGE. + * @layer_masks_parent1: Pointer to a matrix of layer masks per access + * masks, identifying the layers that forbid a specific access. Bits from + * this matrix can be unset according to the @path walk. An empty matrix + * means that @domain allows all possible Landlock accesses (i.e. not only + * those identified by @access_request_parent1). 
This matrix can + initially refer to domain layer masks and, when the accesses for the + destination and source are the same, to requested layer masks. + * @dentry_child1: Dentry to the initial child of the parent1 path. This + * pointer must be NULL for non-refer actions (i.e. not link nor rename). + * @access_request_parent2: Similar to @access_request_parent1 but for a + * request involving a source and a destination. This refers to the + * destination, except in case of RENAME_EXCHANGE where it also refers to + * the source. Must be set to 0 when using a simple path request. + * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer + * action. This must be NULL otherwise. + * @dentry_child2: Dentry to the initial child of the parent2 path. This + * pointer is only set for RENAME_EXCHANGE actions and must be NULL + * otherwise. + * + * This helper first checks that the destination has a superset of restrictions + * compared to the source (if any) for a common path. Because of + * RENAME_EXCHANGE actions, the source and destination may be swapped. It then + * checks that the collected accesses and the remaining ones are enough to + * allow the request. + * + * Returns: + * - 0 if the access request is granted; + * - -EACCES if it is denied because of an access right other than + * LANDLOCK_ACCESS_FS_REFER; + * - -EXDEV if the renaming or linking would be a privilege escalation + * (according to each layered policy), or if LANDLOCK_ACCESS_FS_REFER is + * not allowed by the source or the destination. + */ +static int check_access_path_dual( + const struct landlock_ruleset *const domain, + const struct path *const path, + const access_mask_t access_request_parent1, + layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS], + const struct dentry *const dentry_child1, + const access_mask_t access_request_parent2, + layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS], + const struct dentry *const dentry_child2) +{ + bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check, + child1_is_directory = true, child2_is_directory = true; + struct path walker_path; + access_mask_t access_masked_parent1, access_masked_parent2; + layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS], + _layer_masks_child2[LANDLOCK_NUM_ACCESS_FS]; + layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL, + (*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL; + + if (!access_request_parent1 && !access_request_parent2) return 0; + if (WARN_ON_ONCE(!domain || !path)) + return 0; + if (is_nouser_or_private(path->dentry)) + return 0; + if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1)) + return -EACCES; + + if (unlikely(layer_masks_parent2)) { + if (WARN_ON_ONCE(!dentry_child1)) + return -EACCES; + /* + * For a double request, first check for potential privilege + * escalation by looking at domain handled accesses (which are + * a superset of the meaningful requested accesses). + */ + access_masked_parent1 = access_masked_parent2 = + get_handled_accesses(domain); + is_dom_check = true; + } else { + if (WARN_ON_ONCE(dentry_child1 || dentry_child2)) + return -EACCES; + /* For a simple request, only check for requested accesses.
*/ + access_masked_parent1 = access_request_parent1; + access_masked_parent2 = access_request_parent2; + is_dom_check = false; + } + + if (unlikely(dentry_child1)) { + unmask_layers(find_rule(domain, dentry_child1), + init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, + &_layer_masks_child1), + &_layer_masks_child1); + layer_masks_child1 = &_layer_masks_child1; + child1_is_directory = d_is_dir(dentry_child1); + } + if (unlikely(dentry_child2)) { + unmask_layers(find_rule(domain, dentry_child2), + init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, + &_layer_masks_child2), + &_layer_masks_child2); + layer_masks_child2 = &_layer_masks_child2; + child2_is_directory = d_is_dir(dentry_child2); + } walker_path = *path; path_get(&walker_path); @@ -316,11 +539,52 @@ static int check_access_path(const struct landlock_ruleset *const domain, */ while (true) { struct dentry *parent_dentry; + const struct landlock_rule *rule; + + /* + * If at least all accesses allowed on the destination are + * already allowed on the source, respectively if there are at + * least as many restrictions on the destination as on the + * source, then we can safely refer files from the source to + * the destination without risking a privilege escalation. + * This also applies in the case of RENAME_EXCHANGE, which + * implies checks in both directions. This is crucial for + * standalone multilayered security policies. Furthermore, + * this helps keep policy writers from shooting themselves in + * the foot. + */ + if (unlikely(is_dom_check && + no_more_access( + layer_masks_parent1, layer_masks_child1, + child1_is_directory, layer_masks_parent2, + layer_masks_child2, + child2_is_directory))) { + allowed_parent1 = scope_to_request( + access_request_parent1, layer_masks_parent1); + allowed_parent2 = scope_to_request( + access_request_parent2, layer_masks_parent2); + + /* Stops when all accesses are granted. */ + if (allowed_parent1 && allowed_parent2) + break; - allowed = unmask_layers(find_rule(domain, walker_path.dentry), - access_request, &layer_masks); - if (allowed) - /* Stops when a rule from each layer grants access. */ + /* + * Now, downgrades the remaining checks from domain + * handled accesses to requested accesses. + */ + is_dom_check = false; + access_masked_parent1 = access_request_parent1; + access_masked_parent2 = access_request_parent2; + } + + rule = find_rule(domain, walker_path.dentry); + allowed_parent1 = unmask_layers(rule, access_masked_parent1, + layer_masks_parent1); + allowed_parent2 = unmask_layers(rule, access_masked_parent2, + layer_masks_parent2); + + /* Stops when a rule from each layer grants access. */ + if (allowed_parent1 && allowed_parent2) break; jump_up: @@ -333,7 +597,6 @@ jump_up: * Stops at the real root. Denies access * because not all layers have granted access. */ - allowed = false; break; } } @@ -343,7 +606,8 @@ jump_up: * access to internal filesystems (e.g. nsfs, which is * reachable through /proc//ns/). */ - allowed = !!(walker_path.mnt->mnt_flags & MNT_INTERNAL); + allowed_parent1 = allowed_parent2 = + !!(walker_path.mnt->mnt_flags & MNT_INTERNAL); break; } parent_dentry = dget_parent(walker_path.dentry); @@ -351,7 +615,36 @@ jump_up: walker_path.dentry = parent_dentry; } path_put(&walker_path); - return allowed ? 0 : -EACCES; + + if (allowed_parent1 && allowed_parent2) + return 0; + + /* + * This prioritizes EACCES over EXDEV for all actions, including + * renames with RENAME_EXCHANGE.
+ */ + if (likely(is_eacces(layer_masks_parent1, access_request_parent1) || + is_eacces(layer_masks_parent2, access_request_parent2))) + return -EACCES; + + /* + * Gracefully forbids reparenting if the destination directory + * hierarchy is not a superset of the restrictions of the source + * directory hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed + * by the source or the destination. + */ + return -EXDEV; +} + +static inline int check_access_path(const struct landlock_ruleset *const domain, + const struct path *const path, + access_mask_t access_request) +{ + layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {}; + + access_request = init_layer_masks(domain, access_request, &layer_masks); + return check_access_path_dual(domain, path, access_request, + &layer_masks, NULL, 0, NULL, NULL); } static inline int current_check_access_path(const struct path *const path, @@ -398,6 +691,206 @@ static inline access_mask_t maybe_remove(const struct dentry *const dentry) LANDLOCK_ACCESS_FS_REMOVE_FILE; } +/** + * collect_domain_accesses - Walk through a file path and collect accesses + * + * @domain: Domain to check against. + * @mnt_root: Last directory to check. + * @dir: Directory to start the walk from. + * @layer_masks_dom: Where to store the collected accesses. + * + * This helper is useful to begin a path walk from the @dir directory to a + * @mnt_root directory used as a mount point. This mount point is the common + * ancestor between the source and the destination of a renamed or linked + * file. While walking from @dir to @mnt_root, we record all the domain's + * allowed accesses in @layer_masks_dom. + * + * This is similar to check_access_path_dual() but much simpler because it only + * handles walking on the same mount point and only checks one set of accesses. + * + * Returns: + * - true if all the domain access rights are allowed for @dir; + * - false if the walk reached @mnt_root. + */ +static bool collect_domain_accesses( + const struct landlock_ruleset *const domain, + const struct dentry *const mnt_root, struct dentry *dir, + layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS]) +{ + unsigned long access_dom; + bool ret = false; + + if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom)) + return true; + if (is_nouser_or_private(dir)) + return true; + + access_dom = init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS, + layer_masks_dom); + + dget(dir); + while (true) { + struct dentry *parent_dentry; + + /* Gets all layers allowing all domain accesses. */ + if (unmask_layers(find_rule(domain, dir), access_dom, + layer_masks_dom)) { + /* + * Stops when all handled accesses are allowed by at + * least one rule in each layer. + */ + ret = true; + break; + } + + /* We should not reach a root other than @mnt_root. */ + if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir))) + break; + + parent_dentry = dget_parent(dir); + dput(dir); + dir = parent_dentry; + } + dput(dir); + return ret; +} + +/** + * current_check_refer_path - Check if a rename or link action is allowed + * + * @old_dentry: File or directory requested to be moved or linked. + * @new_dir: Destination parent directory. + * @new_dentry: Destination file or directory. + * @removable: Set to true if it is a rename operation. + * @exchange: Set to true if it is a rename operation with RENAME_EXCHANGE. + * + * Because of its unprivileged constraints, Landlock relies on file hierarchies + * (and not only inodes) to tie access rights to files.
Being able to link or + rename a file hierarchy brings some challenges. Indeed, moving or linking a + file (i.e. creating a new reference to an inode) can have an impact on the + actions allowed for a set of files if it would change its parent directory + (i.e. reparenting). + * + * To avoid trivial access right bypasses, Landlock first checks if the file or + * directory requested to be moved would gain new access rights inherited from + * its new hierarchy. Before returning any error, Landlock then checks that + * the parent source hierarchy and the destination hierarchy would allow the + * link or rename action. If that is not the case, an error with EACCES is + * returned to inform user space that there is no way to remove or create the + * requested source file type. If it should be allowed but the new inherited + * access rights would be greater than the source access rights, then the + * kernel returns an error with EXDEV. Prioritizing EACCES over EXDEV enables + * user space to abort the whole operation if there is no way to do it, or to + * manually copy the source to the destination if this remains allowed, e.g. + * because file creation is allowed on the destination directory but not direct + * linking. + * + * To achieve this goal, the kernel needs to compare two file hierarchies: the + * one identifying the source file or directory (including itself), and the + * destination one. This can be seen as a multilayer partial ordering problem. + * The kernel walks through these paths and collects in a matrix the access + * rights that are denied per layer. These matrices are then compared to see + * if the destination one has more (or the same) restrictions as the source + * one. If this is the case, the requested action will not return EXDEV, which + * doesn't mean the action is allowed. The parent hierarchy of the source + * (i.e. parent directory), and the destination hierarchy must also be checked + * to verify that they explicitly allow such an action (i.e. referencing, + * creation and potentially removal rights). The kernel implementation is then + * required to rely on potentially four matrices of access rights: one for the + * source file or directory (i.e. the child), potentially another one for the + * other source/destination (in case of RENAME_EXCHANGE), one for the source + * parent hierarchy and a last one for the destination hierarchy. These + * ephemeral matrices take some space on the stack, which limits the number of + * layers to a deemed reasonable number: 16. + * + * Returns: + * - 0 if access is allowed; + * - -EXDEV if @old_dentry would inherit new access rights from @new_dir; + * - -EACCES if file removal or creation is denied.
+ */ +static int current_check_refer_path(struct dentry *const old_dentry, + const struct path *const new_dir, + struct dentry *const new_dentry, + const bool removable, const bool exchange) +{ + const struct landlock_ruleset *const dom = + landlock_get_current_domain(); + bool allow_parent1, allow_parent2; + access_mask_t access_request_parent1, access_request_parent2; + struct path mnt_dir; + layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS], + layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS]; + + if (!dom) + return 0; + if (WARN_ON_ONCE(dom->num_layers < 1)) + return -EACCES; + if (unlikely(d_is_negative(old_dentry))) + return -ENOENT; + if (exchange) { + if (unlikely(d_is_negative(new_dentry))) + return -ENOENT; + access_request_parent1 = + get_mode_access(d_backing_inode(new_dentry)->i_mode); + } else { + access_request_parent1 = 0; + } + access_request_parent2 = + get_mode_access(d_backing_inode(old_dentry)->i_mode); + if (removable) { + access_request_parent1 |= maybe_remove(old_dentry); + access_request_parent2 |= maybe_remove(new_dentry); + } + + /* The mount points are the same for old and new paths, cf. EXDEV. */ + if (old_dentry->d_parent == new_dir->dentry) { + /* + * The LANDLOCK_ACCESS_FS_REFER access right is not required + * for same-directory referer (i.e. no reparenting). + */ + access_request_parent1 = init_layer_masks( + dom, access_request_parent1 | access_request_parent2, + &layer_masks_parent1); + return check_access_path_dual(dom, new_dir, + access_request_parent1, + &layer_masks_parent1, NULL, 0, + NULL, NULL); + } + + /* Backward compatibility: no reparenting support. */ + if (!(get_handled_accesses(dom) & LANDLOCK_ACCESS_FS_REFER)) + return -EXDEV; + + access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER; + access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER; + + /* Saves the common mount point. */ + mnt_dir.mnt = new_dir->mnt; + mnt_dir.dentry = new_dir->mnt->mnt_root; + + /* new_dir->dentry is equal to new_dentry->d_parent */ + allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, + old_dentry->d_parent, + &layer_masks_parent1); + allow_parent2 = collect_domain_accesses( + dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2); + + if (allow_parent1 && allow_parent2) + return 0; + + /* + * To be able to compare source and destination domain access rights, + * take into account the @old_dentry access rights aggregated with its + * parent access rights. This will be useful to compare with the + * destination parent access rights. + */ + return check_access_path_dual(dom, &mnt_dir, access_request_parent1, + &layer_masks_parent1, old_dentry, + access_request_parent2, + &layer_masks_parent2, + exchange ? new_dentry : NULL); +} + /* Inode hooks */ static void hook_inode_free_security(struct inode *const inode) @@ -591,32 +1084,12 @@ static int hook_sb_pivotroot(const struct path *const old_path, /* Path hooks */ -/* - * Creating multiple links or renaming may lead to privilege escalations if not - * handled properly. Indeed, we must be sure that the source doesn't gain more - * privileges by being accessible from the destination. This is getting more - * complex when dealing with multiple layers. The whole picture can be seen as - * a multilayer partial ordering problem. A future version of Landlock will - * deal with that. 
- */ static int hook_path_link(struct dentry *const old_dentry, const struct path *const new_dir, struct dentry *const new_dentry) { - const struct landlock_ruleset *const dom = - landlock_get_current_domain(); - - if (!dom) - return 0; - /* The mount points are the same for old and new paths, cf. EXDEV. */ - if (old_dentry->d_parent != new_dir->dentry) - /* Gracefully forbids reparenting. */ - return -EXDEV; - if (unlikely(d_is_negative(old_dentry))) - return -ENOENT; - return check_access_path( - dom, new_dir, - get_mode_access(d_backing_inode(old_dentry)->i_mode)); + return current_check_refer_path(old_dentry, new_dir, new_dentry, false, + false); } static int hook_path_rename(const struct path *const old_dir, @@ -625,30 +1098,9 @@ static int hook_path_rename(const struct path *const old_dir, struct dentry *const new_dentry, const unsigned int flags) { - const struct landlock_ruleset *const dom = - landlock_get_current_domain(); - u32 exchange_access = 0; - - if (!dom) - return 0; - /* The mount points are the same for old and new paths, cf. EXDEV. */ - if (old_dir->dentry != new_dir->dentry) - /* Gracefully forbids reparenting. */ - return -EXDEV; - if (flags & RENAME_EXCHANGE) { - if (unlikely(d_is_negative(new_dentry))) - return -ENOENT; - exchange_access = - get_mode_access(d_backing_inode(new_dentry)->i_mode); - } - if (unlikely(d_is_negative(old_dentry))) - return -ENOENT; - /* RENAME_EXCHANGE is handled because directories are the same. */ - return check_access_path( - dom, old_dir, - maybe_remove(old_dentry) | maybe_remove(new_dentry) | - exchange_access | - get_mode_access(d_backing_inode(old_dentry)->i_mode)); + /* old_dir refers to old_dentry->d_parent and new_dir->mnt */ + return current_check_refer_path(old_dentry, new_dir, new_dentry, true, + !!(flags & RENAME_EXCHANGE)); } static int hook_path_mkdir(const struct path *const dir, diff --git a/security/landlock/limits.h b/security/landlock/limits.h index 17c2a2e7fe1e..b54184ab9439 100644 --- a/security/landlock/limits.h +++ b/security/landlock/limits.h @@ -18,7 +18,7 @@ #define LANDLOCK_MAX_NUM_LAYERS 16 #define LANDLOCK_MAX_NUM_RULES U32_MAX -#define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_MAKE_SYM +#define LANDLOCK_LAST_ACCESS_FS LANDLOCK_ACCESS_FS_REFER #define LANDLOCK_MASK_ACCESS_FS ((LANDLOCK_LAST_ACCESS_FS << 1) - 1) #define LANDLOCK_NUM_ACCESS_FS __const_hweight64(LANDLOCK_MASK_ACCESS_FS) diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 507d43827afe..735a0865ea11 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -129,7 +129,7 @@ static const struct file_operations ruleset_fops = { .write = fop_dummy_write, }; -#define LANDLOCK_ABI_VERSION 1 +#define LANDLOCK_ABI_VERSION 2 /** * sys_landlock_create_ruleset - Create a new ruleset diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 35f64832b869..da9290817866 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -75,7 +75,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(1, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(2, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index a4fdcda62bde..69f9c7409198 100644 --- 
a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -401,7 +401,7 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_WRITE_FILE | \ LANDLOCK_ACCESS_FS_READ_FILE) -#define ACCESS_LAST LANDLOCK_ACCESS_FS_MAKE_SYM +#define ACCESS_LAST LANDLOCK_ACCESS_FS_REFER #define ACCESS_ALL ( \ ACCESS_FILE | \ @@ -414,6 +414,7 @@ TEST_F_FORK(layout1, inval) LANDLOCK_ACCESS_FS_MAKE_SOCK | \ LANDLOCK_ACCESS_FS_MAKE_FIFO | \ LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ + LANDLOCK_ACCESS_FS_MAKE_SYM | \ ACCESS_LAST) /* clang-format on */ -- cgit v1.2.3 From 97e03f521050c092919591e668107b3d69c5f426 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:07 -0700 Subject: bpf: Add verifier support for dynptrs This patch adds the bulk of the verifier work for supporting dynamic pointers (dynptrs) in bpf. A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure defined internally as: struct bpf_dynptr_kern { void *data; u32 size; u32 offset; } __aligned(8); The upper 8 bits of *size* are reserved (they contain extra metadata about read-only status and dynptr type). Consequently, a dynptr only supports memory less than 16 MB. There are different types of dynptrs (e.g. malloc, ringbuf, ...). In this patchset, the most basic one, dynptrs to a bpf program's local memory, is added. For now only local memory that is of reg type PTR_TO_MAP_VALUE is supported. In the verifier, dynptr state information will be tracked in stack slots. When the program passes in an uninitialized dynptr (ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding to the frame pointer where the dynptr resides are marked STACK_DYNPTR. For helper functions that take in initialized dynptrs (e.g. bpf_dynptr_read and bpf_dynptr_write, which are added later in this patchset), the verifier enforces that the dynptr has been initialized properly by checking that their corresponding stack slots have been marked as STACK_DYNPTR. The 6th patch in this patchset adds test cases that the verifier should successfully reject, such as attempting to use a dynptr after doing a direct write into it inside the bpf program. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-2-joannelkoong@gmail.com --- include/linux/bpf.h | 28 ++++++ include/linux/bpf_verifier.h | 18 ++++ include/uapi/linux/bpf.h | 5 ++ kernel/bpf/verifier.c | 188 ++++++++++++++++++++++++++++++++++++++++- scripts/bpf_doc.py | 2 + tools/include/uapi/linux/bpf.h | 5 ++ 6 files changed, 243 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b1875212f6..b26c8176b9e0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -392,10 +392,15 @@ enum bpf_type_flag { MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to memory local to the bpf program. */ + DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; +#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL + /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -438,6 +443,7 @@ enum bpf_arg_type { ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr.
See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -2376,4 +2382,26 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 **bin_buf, u32 num_args); void bpf_bprintf_cleanup(void); +/* the implementation of the opaque uapi struct bpf_dynptr */ +struct bpf_dynptr_kern { + void *data; + /* Size represents the number of usable bytes of dynptr data. + * If for example the offset is at 4 for a local dynptr whose data is + * of type u64, the number of usable bytes is 4. + * + * The upper 8 bits are reserved. It is as follows: + * Bits 0 - 23 = size + * Bits 24 - 30 = dynptr type + * Bit 31 = whether dynptr is read-only + */ + u32 size; + u32 offset; +} __aligned(8); + +enum bpf_dynptr_type { + BPF_DYNPTR_TYPE_INVALID, + /* Points to memory that is local to the bpf program */ + BPF_DYNPTR_TYPE_LOCAL, +}; + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1f1e7f2ea967..af5b2135215e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -72,6 +72,18 @@ struct bpf_reg_state { u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* For dynptr stack slots */ + struct { + enum bpf_dynptr_type type; + /* A dynptr is 16 bytes so it takes up 2 stack slots. + * We need to track which slot is the first slot + * to protect against cases where the user may try to + * pass in an address starting at the second slot of the + * dynptr. + */ + bool first_slot; + } dynptr; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -174,9 +186,15 @@ enum bpf_stack_slot_type { STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ + /* A dynptr is stored in this stack slot. The type of dynptr + * is stored in bpf_stack_state->spilled_ptr.dynptr.type + */ + STACK_DYNPTR, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +#define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) +#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 56688bee20d9..610944cb3389 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6528,6 +6528,11 @@ struct bpf_timer { __u64 :64; } __attribute__((aligned(8))); +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. 
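To make the stack-slot bookkeeping concrete, here is a minimal standalone C sketch (not part of the patch) that mirrors the get_spi() and is_spi_bounds_valid() helpers introduced in the verifier.c hunk that follows; it assumes only the 16-byte dynptr size stated above. A dynptr at frame-pointer offset -16 occupies slots spi and spi - 1, i.e. slots 1 and 0:

#include <stdio.h>

#define BPF_REG_SIZE 8
#define BPF_DYNPTR_SIZE 16 /* sizeof(struct bpf_dynptr_kern) */
#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE)

/* Maps a negative frame-pointer offset to a spill index (spi). */
static int get_spi(int off)
{
	return (-off - 1) / BPF_REG_SIZE;
}

static int is_spi_bounds_valid(int allocated_stack, int spi, int nr_slots)
{
	int allocated_slots = allocated_stack / BPF_REG_SIZE;

	/* Slots [spi - nr_slots + 1, spi] must lie within [0, allocated_slots). */
	return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}

int main(void)
{
	int spi = get_spi(-16);

	/* With a 32-byte stack (4 slots), prints: spi=1, slots=[0,1], valid=1 */
	printf("spi=%d, slots=[%d,%d], valid=%d\n", spi, spi - 1, spi,
	       is_spi_bounds_valid(32, spi, BPF_DYNPTR_NR_SLOTS));
	return 0;
}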
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45153cbc2bd6..1fd0b81c3fec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -259,6 +259,7 @@ struct bpf_call_arg_meta { u32 ret_btf_id; u32 subprogno; struct bpf_map_value_off_desc *kptr_off_desc; + u8 uninit_dynptr_regno; }; struct btf *btf_vmlinux; @@ -581,6 +582,7 @@ static char slot_type_char[] = { [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', }; static void print_liveness(struct bpf_verifier_env *env, @@ -596,6 +598,25 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } +static int get_spi(s32 off) +{ + return (-off - 1) / BPF_REG_SIZE; +} + +static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) +{ + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; +} + static struct bpf_func_state *func(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { @@ -647,6 +668,108 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) +{ + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + return BPF_DYNPTR_TYPE_LOCAL; + default: + return BPF_DYNPTR_TYPE_INVALID; + } +} + +static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type, int insn_idx) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type type; + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_DYNPTR; + state->stack[spi - 1].slot_type[i] = STACK_DYNPTR; + } + + type = arg_to_dynptr_type(arg_type); + if (type == BPF_DYNPTR_TYPE_INVALID) + return -EINVAL; + + state->stack[spi].spilled_ptr.dynptr.first_slot = true; + state->stack[spi].spilled_ptr.dynptr.type = type; + state->stack[spi - 1].spilled_ptr.dynptr.type = type; + + return 0; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; + } + + state->stack[spi].spilled_ptr.dynptr.first_slot = false; + state->stack[spi].spilled_ptr.dynptr.type = 0; + state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return true; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) + return false; + } + + return true; +} + +static bool 
is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.dynptr.first_slot) + return false; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] != STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) + return false; + } + + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ + if (arg_type == ARG_PTR_TO_DYNPTR) + return true; + + return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type); +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -5400,6 +5523,11 @@ static bool arg_type_is_release(enum bpf_arg_type type) return type & OBJ_RELEASE; } +static bool arg_type_is_dynptr(enum bpf_arg_type type) +{ + return base_type(type) == ARG_PTR_TO_DYNPTR; +} + static int int_ptr_type_to_size(enum bpf_arg_type type) { if (type == ARG_PTR_TO_INT) @@ -5539,6 +5667,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -5628,8 +5757,13 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, bool fixed_off_ok = false; switch ((u32)type) { - case SCALAR_VALUE: /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_STACK: + if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset\n"); + return -EINVAL; + } + fallthrough; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: @@ -5639,7 +5773,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: + case SCALAR_VALUE: /* Some of the argument types nevertheless require a * zero register offset. */ @@ -5837,6 +5971,36 @@ skip_type_check: bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); + } else if (arg_type_is_dynptr(arg_type)) { + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. 
+ */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local "; + break; + default: + break; + } + verbose(env, "Expected an initialized %sdynptr as arg #%d\n", + err_extra, arg + 1); + return -EINVAL; + } } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", @@ -6970,9 +7134,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + if (meta.uninit_dynptr_regno) { + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], + fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], + insn_idx); + if (err) + return err; + } + if (meta.release_regno) { err = -EINVAL; - if (meta.ref_obj_id) + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + else if (meta.ref_obj_id) err = release_reference(env, meta.ref_obj_id); /* meta.ref_obj_id can only be 0 if register that is meant to be * released is NULL, which must be > R0. diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index d5452f7eb996..855b937e7585 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -634,6 +634,7 @@ class PrinterHelpers(Printer): 'struct file', 'struct bpf_timer', 'struct mptcp_sock', + 'struct bpf_dynptr', ] known_types = { '...', @@ -684,6 +685,7 @@ class PrinterHelpers(Printer): 'struct file', 'struct bpf_timer', 'struct mptcp_sock', + 'struct bpf_dynptr', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 56688bee20d9..610944cb3389 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6528,6 +6528,11 @@ struct bpf_timer { __u64 :64; } __attribute__((aligned(8))); +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3 From 263ae152e96253f40c2c276faad8629e096b3bad Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:08 -0700 Subject: bpf: Add bpf_dynptr_from_mem for local dynptrs This patch adds a new api bpf_dynptr_from_mem: long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr); which initializes a dynptr to point to a bpf program's local memory. For now only local memory that is of reg type PTR_TO_MAP_VALUE is supported. 
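For illustration, a minimal usage sketch of the new helper from a bpf program (not part of this patch), assuming a libbpf whose generated helper definitions match this kernel; the map, section, and function names here are hypothetical:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

/* Hypothetical single-entry array map providing the local memory. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} scratch SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_nanosleep")
int make_local_dynptr(void *ctx)
{
	struct bpf_dynptr ptr; /* 16 bytes on the bpf stack, initially uninitialized */
	__u32 key = 0;
	__u64 *val;

	val = bpf_map_lookup_elem(&scratch, &key);
	if (!val)
		return 0;

	/* *data* must be a map value for now; *flags* must be 0. */
	if (bpf_dynptr_from_mem(val, sizeof(*val), 0, &ptr))
		return 0;

	/* ptr is now an initialized local dynptr over the map value. */
	return 0;
}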
Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220523210712.3641569-3-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 12 ++++++++ kernel/bpf/helpers.c | 65 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 ++++ tools/include/uapi/linux/bpf.h | 12 ++++++++ 4 files changed, 95 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 610944cb3389..9be3644457dd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5178,6 +5178,17 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5377,6 +5388,7 @@ union bpf_attr { FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ + FN(dynptr_from_mem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bad96131a510..d3e935c2e25e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1412,6 +1412,69 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = { .arg2_btf_id = BPF_PTR_POISON, }; +/* Since the upper 8 bits of dynptr->size is reserved, the + * maximum supported size is 2^24 - 1. + */ +#define DYNPTR_MAX_SIZE ((1UL << 24) - 1) +#define DYNPTR_TYPE_SHIFT 28 + +static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) +{ + ptr->size |= type << DYNPTR_TYPE_SHIFT; +} + +static int bpf_dynptr_check_size(u32 size) +{ + return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; +} + +static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ + ptr->data = data; + ptr->offset = offset; + ptr->size = size; + bpf_dynptr_set_type(ptr, type); +} + +static void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ + memset(ptr, 0, sizeof(*ptr)); +} + +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +{ + int err; + + err = bpf_dynptr_check_size(size); + if (err) + goto error; + + /* flags is currently unsupported */ + if (flags) { + err = -EINVAL; + goto error; + } + + bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); + + return 0; + +error: + bpf_dynptr_set_null(ptr); + return err; +} + +const struct bpf_func_proto bpf_dynptr_from_mem_proto = { + .func = bpf_dynptr_from_mem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1466,6 +1529,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1fd0b81c3fec..b657d46f886e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7204,6 +7204,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; + case BPF_FUNC_dynptr_from_mem: + if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { + verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n", + reg_type_str(env, regs[BPF_REG_1].type)); + return -EACCES; + } } if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 610944cb3389..9be3644457dd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5178,6 +5178,17 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5377,6 +5388,7 @@ union bpf_attr { FN(kptr_xchg), \ FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ + FN(dynptr_from_mem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From bc34dee65a65e9c920c420005b8a43f2a721a458 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:09 -0700 Subject: bpf: Dynptr support for ring buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, our only way of writing dynamically-sized data into a ring buffer is through bpf_ringbuf_output but this incurs an extra memcpy cost. 
bpf_ringbuf_reserve + bpf_ringbuf_commit avoids this extra memcpy, but it can only safely support reservation sizes that are statically known since the verifier cannot guarantee that the bpf program won’t access memory outside the reserved space. The bpf_dynptr abstraction allows for dynamically-sized ring buffer reservations without the extra memcpy. There are 3 new APIs: long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr); void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags); void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags); These closely follow the functionalities of the original ringbuf APIs. For example, all ringbuffer dynptrs that have been reserved must be either submitted or discarded before the program exits. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-4-joannelkoong@gmail.com --- include/linux/bpf.h | 15 +++++++- include/linux/bpf_verifier.h | 2 ++ include/uapi/linux/bpf.h | 35 +++++++++++++++++++ kernel/bpf/helpers.c | 14 +++++--- kernel/bpf/ringbuf.c | 78 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 35 +++++++++++++++++++ 7 files changed, 223 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b26c8176b9e0..c72321b6f306 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -395,11 +395,14 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to a ringbuf record. */ + DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -2231,6 +2234,9 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; @@ -2402,6 +2408,13 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, }; +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size); +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); +int bpf_dynptr_check_size(u32 size); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index af5b2135215e..e8439f6cbe57 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -100,6 +100,8 @@ struct bpf_reg_state { * for the purpose of tracking that it's freed. 
* For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. + * For stack slots that are dynptrs, this is used to track references to + * the dynptr to determine proper reference freeing. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9be3644457dd..081a55540aa5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5189,6 +5189,38 @@ union bpf_attr { * Return * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5389,6 +5421,9 @@ union bpf_attr { FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ FN(dynptr_from_mem), \ + FN(ringbuf_reserve_dynptr), \ + FN(ringbuf_submit_dynptr), \ + FN(ringbuf_discard_dynptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d3e935c2e25e..abb08999ff56 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1423,13 +1423,13 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -static int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; } -static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, - enum bpf_dynptr_type type, u32 offset, u32 size) +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; @@ -1437,7 +1437,7 @@ static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, bpf_dynptr_set_type(ptr, type); } -static void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } @@ -1523,6 +1523,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 311264ab80c4..ded4faeca192 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -475,3 +475,81 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, + struct bpf_dynptr_kern *, ptr) +{ + struct bpf_ringbuf_map *rb_map; + void *sample; + int err; + + if (unlikely(flags)) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + err = bpf_dynptr_check_size(size); + if (err) { + bpf_dynptr_set_null(ptr); + return err; + } + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + sample = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!sample) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { + .func = bpf_ringbuf_reserve_dynptr, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, +}; + +BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, false /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { + .func = bpf_ringbuf_submit_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, true /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { + .func = bpf_ringbuf_discard_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b657d46f886e..8be140351966 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -187,6 +187,9 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +static int acquire_reference_state(struct 
bpf_verifier_env *env, int insn_idx); +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); + static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; @@ -673,17 +676,24 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { case DYNPTR_TYPE_LOCAL: return BPF_DYNPTR_TYPE_LOCAL; + case DYNPTR_TYPE_RINGBUF: + return BPF_DYNPTR_TYPE_RINGBUF; default: return BPF_DYNPTR_TYPE_INVALID; } } +static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +{ + return type == BPF_DYNPTR_TYPE_RINGBUF; +} + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; - int spi, i; + int spi, i, id; spi = get_spi(reg->off); @@ -703,6 +713,16 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi].spilled_ptr.dynptr.type = type; state->stack[spi - 1].spilled_ptr.dynptr.type = type; + if (dynptr_type_refcounted(type)) { + /* The id is used to track proper releasing */ + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + state->stack[spi].spilled_ptr.id = id; + state->stack[spi - 1].spilled_ptr.id = id; + } + return 0; } @@ -721,6 +741,13 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re state->stack[spi - 1].slot_type[i] = STACK_INVALID; } + /* Invalidate any slices associated with this dynptr */ + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + release_reference(env, state->stack[spi].spilled_ptr.id); + state->stack[spi].spilled_ptr.id = 0; + state->stack[spi - 1].spilled_ptr.id = 0; + } + state->stack[spi].spilled_ptr.dynptr.first_slot = false; state->stack[spi].spilled_ptr.dynptr.type = 0; state->stack[spi - 1].spilled_ptr.dynptr.type = 0; @@ -5859,7 +5886,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, skip_type_check: if (arg_type_is_release(arg_type)) { - if (!reg->ref_obj_id && !register_is_null(reg)) { + if (arg_type_is_dynptr(arg_type)) { + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else if (!reg->ref_obj_id && !register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; @@ -5994,9 +6030,13 @@ skip_type_check: case DYNPTR_TYPE_LOCAL: err_extra = "local "; break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf "; + break; default: break; } + verbose(env, "Expected an initialized %sdynptr as arg #%d\n", err_extra, arg + 1); return -EINVAL; @@ -6122,7 +6162,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_query) + func_id != BPF_FUNC_ringbuf_query && + func_id != BPF_FUNC_ringbuf_reserve_dynptr && + func_id != BPF_FUNC_ringbuf_submit_dynptr && + func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -6238,6 +6281,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: 
case BPF_FUNC_ringbuf_query: + case BPF_FUNC_ringbuf_reserve_dynptr: + case BPF_FUNC_ringbuf_submit_dynptr: + case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9be3644457dd..081a55540aa5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5189,6 +5189,38 @@ union bpf_attr { * Return * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5389,6 +5421,9 @@ union bpf_attr { FN(map_lookup_percpu_elem), \ FN(skc_to_mptcp_sock), \ FN(dynptr_from_mem), \ + FN(ringbuf_reserve_dynptr), \ + FN(ringbuf_submit_dynptr), \ + FN(ringbuf_discard_dynptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 13bbbfbea7598ea9f8d9c3d73bf053bb57f9c4b2 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:10 -0700 Subject: bpf: Add bpf_dynptr_read and bpf_dynptr_write This patch adds two helper functions, bpf_dynptr_read and bpf_dynptr_write: long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset); long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len); The dynptr passed into these functions must be valid dynptrs that have been initialized. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220523210712.3641569-5-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 19 ++++++++++ kernel/bpf/helpers.c | 78 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 19 ++++++++++ 3 files changed, 116 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 081a55540aa5..efe2505650e6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5221,6 +5221,23 @@ union bpf_attr { * 'bpf_ringbuf_discard'. * Return * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. 
+ * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5424,6 +5441,8 @@ union bpf_attr { FN(ringbuf_reserve_dynptr), \ FN(ringbuf_submit_dynptr), \ FN(ringbuf_discard_dynptr), \ + FN(dynptr_read), \ + FN(dynptr_write), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index abb08999ff56..8cef3fb0d143 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1417,12 +1417,24 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = { */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 +#define DYNPTR_SIZE_MASK 0xFFFFFF +#define DYNPTR_RDONLY_BIT BIT(31) + +static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_RDONLY_BIT; +} static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } +static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_SIZE_MASK; +} + int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; @@ -1442,6 +1454,16 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } +static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = bpf_dynptr_get_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} + BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1475,6 +1497,58 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; +BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +{ + int err; + + if (!src->data) + return -EINVAL; + + err = bpf_dynptr_check_off_len(src, offset, len); + if (err) + return err; + + memcpy(dst, src->data + src->offset + offset, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_read_proto = { + .func = bpf_dynptr_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_DYNPTR, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +{ + int err; + + if (!dst->data || bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(dst, offset, len); + if (err) + return err; + + memcpy(dst->data + dst->offset + offset, src, len); + + return 0; +} + +const struct bpf_func_proto bpf_dynptr_write_proto = { + .func = bpf_dynptr_write, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto 
bpf_probe_read_user_proto __weak; @@ -1537,6 +1611,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_strncmp_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; default: break; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 081a55540aa5..efe2505650e6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5221,6 +5221,23 @@ union bpf_attr { * 'bpf_ringbuf_discard'. * Return * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5424,6 +5441,8 @@ union bpf_attr { FN(ringbuf_reserve_dynptr), \ FN(ringbuf_submit_dynptr), \ FN(ringbuf_discard_dynptr), \ + FN(dynptr_read), \ + FN(dynptr_write), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 34d4ef5775f776ec4b0d53a02d588bf3195cada6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:11 -0700 Subject: bpf: Add dynptr data slices This patch adds a new helper function void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len); which returns a pointer to the underlying data of a dynptr. *len* must be a statically known value. The bpf program may access the returned data slice as a normal buffer (eg can do direct reads and writes), since the verifier associates the length with the returned pointer, and enforces that no out of bounds accesses occur. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523210712.3641569-6-joannelkoong@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 12 ++++++++++++ kernel/bpf/helpers.c | 28 ++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 23 +++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 12 ++++++++++++ 5 files changed, 76 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c72321b6f306..a7080c86fa76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. 
Its purpose is to ensure the enum is diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index efe2505650e6..f4009dbdf62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5238,6 +5238,17 @@ union bpf_attr { * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr. + * + * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5443,6 +5454,7 @@ union bpf_attr { FN(ringbuf_discard_dynptr), \ FN(dynptr_read), \ FN(dynptr_write), \ + FN(dynptr_data), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8cef3fb0d143..225806a02efb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1549,6 +1549,32 @@ const struct bpf_func_proto bpf_dynptr_write_proto = { .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +{ + int err; + + if (!ptr->data) + return 0; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return 0; + + if (bpf_dynptr_is_rdonly(ptr)) + return 0; + + return (unsigned long)(ptr->data + ptr->offset + offset); +} + +const struct bpf_func_proto bpf_dynptr_data_proto = { + .func = bpf_dynptr_data, + .gpl_only = false, + .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1615,6 +1641,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8be140351966..aedac2ac02b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5832,6 +5832,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } +static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + return state->stack[spi].spilled_ptr.id; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -7384,6 +7392,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = id; /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = id; + } else if (func_id == BPF_FUNC_dynptr_data) { + int dynptr_id = 0, i; + + /* Find the id of the dynptr we're acquiring a reference to */ + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + if (dynptr_id) { + verbose(env, "verifier internal error: multiple dynptr args in func\n"); + return 
-EFAULT; + dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]); + } + } + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = dynptr_id; } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index efe2505650e6..f4009dbdf62d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5238,6 +5238,17 @@ union bpf_attr { * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr. + * + * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5443,6 +5454,7 @@ union bpf_attr { FN(ringbuf_discard_dynptr), \ FN(dynptr_read), \ FN(dynptr_write), \ + FN(dynptr_data), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From e5499dd7253c8382d03f687f19a854adcc688357 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 25 May 2022 14:08:30 +0100 Subject: media: lirc: revert removal of unused feature flags Commit b2a90f4fcb14 ("media: lirc: remove unused lirc features") removed feature flags which were never implemented, but they are still used by the lirc daemon when built from source. Reinstate these symbols in order not to break the lirc build. Fixes: b2a90f4fcb14 ("media: lirc: remove unused lirc features") Link: https://lore.kernel.org/all/a0470450-ecfd-2918-e04a-7b57c1fd7694@kernel.org/ Reported-by: Jiri Slaby Cc: Mauro Carvalho Chehab Signed-off-by: Sean Young Signed-off-by: Linus Torvalds --- include/uapi/linux/lirc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 23b0f2c8ba81..8d7ca7c6af42 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -84,6 +84,13 @@ #define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK) #define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK) +/* + * Unused features. These features were never implemented, in tree or + * out of tree. These definitions are here so not to break the lircd build. + */ +#define LIRC_CAN_SET_REC_FILTER 0 +#define LIRC_CAN_NOTIFY_DECODE 0 + /*** IOCTL commands for lirc driver ***/ #define LIRC_GET_FEATURES _IOR('i', 0x00000000, __u32) -- cgit v1.2.3 From caa28984163cb63ea0be4cb8dbf05defdc7303f9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 25 May 2022 09:02:19 -0700 Subject: linux/types.h: reinstate "__bitwise__" macro for user space use Commit c724c866bb70 ("linux/types.h: remove unnecessary __bitwise__") was right that there are no users of __bitwise__ in the kernel, but it turns out there are user space users of it that do expect it. It is, after all, in the uapi directory, so user space usage is to be expected. Instead of reverting the commit completely, let's just clarify the situation so that it doesn't happen again, and have some in-code explanations for why that "__bitwise__" still exists.
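
For illustration, the kind of user-space pattern this keeps building is sketched below. This is a hedged sketch: the typedef name is invented and not taken from any real project.

/* Hypothetical user-space snippet. With the macro reinstated,
 * __bitwise__ expands to __bitwise, which is empty for ordinary
 * compilers and only carries meaning when the code is checked
 * with sparse (i.e. when __CHECKER__ is defined). */
#include <linux/types.h>

typedef __u32 __bitwise__ example_flags_t;	/* invented name */

int main(void)
{
	example_flags_t f = (example_flags_t)0;
	return (int)f;
}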
Reported-by: Jiri Slaby Cc: Bjorn Helgaas Link: https://lore.kernel.org/all/b5c0a68d-8387-4909-beea-f70ab9e6e3d5@kernel.org/ Signed-off-by: Linus Torvalds --- include/uapi/linux/types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index c4dc597f3dcf..308433be33c2 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -26,6 +26,9 @@ #define __bitwise #endif +/* The kernel doesn't use this legacy form, but user space does */ +#define __bitwise__ __bitwise + typedef __u16 __bitwise __le16; typedef __u16 __bitwise __be16; typedef __u32 __bitwise __le32; -- cgit v1.2.3 From a7c41b4687f5902af70cd559806990930c8a307b Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Mon, 30 May 2022 21:15:20 +0800 Subject: io_uring: let IORING_OP_FILES_UPDATE support choosing fixed file slots One big issue with the file registration feature is that it needs user space apps to maintain free slot info about io_uring's fixed file table, which really is a burden for development. io_uring now supports choosing free file slot for user space apps by using IORING_FILE_INDEX_ALLOC flag in accept, open, and socket operations, but they need the app to use direct accept or direct open, which not all apps are prepared to use yet. To support apps that still need real fds, make use of the registration feature easier. Let IORING_OP_FILES_UPDATE support choosing fixed file slots, which will store picked fixed files slots in fd array and let cqe return the number of slots allocated. Suggested-by: Hao Xu Signed-off-by: Xiaoguang Wang [axboe: move flag to uapi io_uring header, change goto to break, init] Signed-off-by: Jens Axboe --- fs/io_uring.c | 72 +++++++++++++++++++++++++++++++++++++------ include/uapi/linux/io_uring.h | 6 ++++ 2 files changed, 68 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 11524ea86f1f..ed3416a7b2e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -574,6 +574,7 @@ struct io_close { struct file *file; int fd; u32 file_slot; + u32 flags; }; struct io_timeout_data { @@ -1366,7 +1367,9 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); +static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset); +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -5947,14 +5950,18 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); req->close.file_slot = READ_ONCE(sqe->file_index); - if (req->close.file_slot && req->close.fd) + req->close.flags = READ_ONCE(sqe->close_flags); + if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT) + return -EINVAL; + if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) && + req->close.file_slot && req->close.fd) return -EINVAL; return 0; @@ -5970,7 +5977,8 @@ static int io_close(struct 
io_kiocb *req, unsigned int issue_flags) if (req->close.file_slot) { ret = io_close_fixed(req, issue_flags); - goto err; + if (ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT)) + goto err; } spin_lock(&files->file_lock); @@ -8003,6 +8011,41 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } +static int io_files_update_with_index_alloc(struct io_kiocb *req, + unsigned int issue_flags) +{ + __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); + unsigned int done; + struct file *file; + int ret, fd; + + for (done = 0; done < req->rsrc_update.nr_args; done++) { + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + ret = -EFAULT; + break; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + break; + } + ret = io_fixed_fd_install(req, issue_flags, file, + IORING_FILE_INDEX_ALLOC); + if (ret < 0) + break; + if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + ret = -EFAULT; + __io_close_fixed(req, issue_flags, ret); + break; + } + } + + if (done) + return done; + return ret; +} + static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -8016,10 +8059,14 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.resv = 0; up.resv2 = 0; - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, issue_flags); + if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { + ret = io_files_update_with_index_alloc(req, issue_flags); + } else { + io_ring_submit_lock(ctx, issue_flags); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up, req->rsrc_update.nr_args); + io_ring_submit_unlock(ctx, issue_flags); + } if (ret < 0) req_set_fail(req); @@ -10183,9 +10230,9 @@ err: return ret; } -static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset) { - unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; struct io_fixed_file *file_slot; struct file *file; @@ -10222,6 +10269,11 @@ out: return ret; } +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); +} + static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 53e7dae92e42..776e0278f9dd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -47,6 +47,7 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; + __u32 close_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -258,6 +259,11 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +/* + * close flags, store in sqe->close_flags + */ +#define IORING_CLOSE_FD_AND_FILE_SLOT (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 3e0b8f529c10037ae0b369fc892e524eae5a5485 Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Mon, 30 May 2022 10:14:14 +0000 Subject: net/ipv6: Expand and rename accept_unsolicited_na to accept_untracked_na RFC 9131 changes default behaviour of handling RX of NA messages when the corresponding entry is absent in the neighbour cache. 
The current implementation is limited to accept just unsolicited NAs. However, the RFC is more generic where it also accepts solicited NAs. Both types should result in adding a STALE entry for this case. Expand accept_untracked_na behaviour to also accept solicited NAs to be compliant with the RFC and rename the sysctl knob to accept_untracked_na. Fixes: f9a2fb73318e ("net/ipv6: Introduce accept_unsolicited_na knob to implement router-side changes for RFC9131") Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220530101414.65439-1-aajith@arista.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 23 +++++------- include/linux/ipv6.h | 2 +- include/uapi/linux/ipv6.h | 2 +- net/ipv6/addrconf.c | 6 ++-- net/ipv6/ndisc.c | 42 +++++++++++++--------- .../selftests/net/ndisc_unsolicited_na_test.sh | 23 ++++++------ 6 files changed, 50 insertions(+), 48 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b882d4238581..04216564a03c 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2474,21 +2474,16 @@ drop_unsolicited_na - BOOLEAN By default this is turned off. -accept_unsolicited_na - BOOLEAN - Add a new neighbour cache entry in STALE state for routers on receiving an - unsolicited neighbour advertisement with target link-layer address option - specified. This is as per router-side behavior documented in RFC9131. - This has lower precedence than drop_unsolicited_na. +accept_untracked_na - BOOLEAN + Add a new neighbour cache entry in STALE state for routers on receiving a + neighbour advertisement (either solicited or unsolicited) with target + link-layer address option specified if no neighbour entry is already + present for the advertised IPv6 address. Without this knob, NAs received + for untracked addresses (absent in neighbour cache) are silently ignored. + + This is as per router-side behaviour documented in RFC9131. - ==== ====== ====== ============================================== - drop accept fwding behaviour - ---- ------ ------ ---------------------------------------------- - 1 X X Drop NA packet and don't pass up the stack - 0 0 X Pass NA packet up the stack, don't update NC - 0 1 0 Pass NA packet up the stack, don't update NC - 0 1 1 Pass NA packet up the stack, and add a STALE - NC entry - ==== ====== ====== ============================================== + This has lower precedence than drop_unsolicited_na. 
This will optimize the return path for the initial off-link communication that is initiated by a directly connected host, by ensuring that diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 38c8203d52cb..37dfdcfcdd54 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -61,7 +61,7 @@ struct ipv6_devconf { __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; - __s32 accept_unsolicited_na; + __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 549ddeaf788b..03cdbe798fe3 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -194,7 +194,7 @@ enum { DEVCONF_IOAM6_ID, DEVCONF_IOAM6_ID_WIDE, DEVCONF_NDISC_EVICT_NOCARRIER, - DEVCONF_ACCEPT_UNSOLICITED_NA, + DEVCONF_ACCEPT_UNTRACKED_NA, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ca0aa744593e..1b1932502e9e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5586,7 +5586,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; - array[DEVCONF_ACCEPT_UNSOLICITED_NA] = cnf->accept_unsolicited_na; + array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; } static inline size_t inet6_ifla6_size(void) @@ -7038,8 +7038,8 @@ static const struct ctl_table addrconf_sysctl[] = { .extra2 = (void *)SYSCTL_ONE, }, { - .procname = "accept_unsolicited_na", - .data = &ipv6_devconf.accept_unsolicited_na, + .procname = "accept_untracked_na", + .data = &ipv6_devconf.accept_untracked_na, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 254addad0dd3..b0dfe97ea4ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -979,7 +979,7 @@ static void ndisc_recv_na(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; - bool create_neigh; + u8 new_state; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NA: packet too short\n"); @@ -1000,7 +1000,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. - * drop_unsolicited_na takes precedence over accept_unsolicited_na + * drop_unsolicited_na takes precedence over accept_untracked_na */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) @@ -1041,25 +1041,33 @@ static void ndisc_recv_na(struct sk_buff *skb) in6_ifa_put(ifp); return; } + + neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + /* RFC 9131 updates original Neighbour Discovery RFC 4861. - * An unsolicited NA can now create a neighbour cache entry - * on routers if it has Target LL Address option. + * NAs with Target LL Address option without a corresponding + * entry in the neighbour cache can now create a STALE neighbour + * cache entry on routers. 
+ * + * entry accept fwding solicited behaviour + * ------- ------ ------ --------- ---------------------- + * present X X 0 Set state to STALE + * present X X 1 Set state to REACHABLE + * absent 0 X X Do nothing + * absent 1 0 X Do nothing + * absent 1 1 X Add a new STALE entry * - * drop accept fwding behaviour - * ---- ------ ------ ---------------------------------------------- - * 1 X X Drop NA packet and don't pass up the stack - * 0 0 X Pass NA packet up the stack, don't update NC - * 0 1 0 Pass NA packet up the stack, don't update NC - * 0 1 1 Pass NA packet up the stack, and add a STALE - * NC entry * Note that we don't do a (daddr == all-routers-mcast) check. */ - create_neigh = !msg->icmph.icmp6_solicited && lladdr && - idev && idev->cnf.forwarding && - idev->cnf.accept_unsolicited_na; - neigh = __neigh_lookup(&nd_tbl, &msg->target, dev, create_neigh); + new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE; + if (!neigh && lladdr && + idev && idev->cnf.forwarding && + idev->cnf.accept_untracked_na) { + neigh = neigh_create(&nd_tbl, &msg->target, dev); + new_state = NUD_STALE; + } - if (neigh) { + if (neigh && !IS_ERR(neigh)) { u8 old_flags = neigh->flags; struct net *net = dev_net(dev); @@ -1079,7 +1087,7 @@ static void ndisc_recv_na(struct sk_buff *skb) } ndisc_update(dev, neigh, lladdr, - msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, + new_state, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| diff --git a/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh b/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh index f508657ee126..86e621b7b9c7 100755 --- a/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh +++ b/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh @@ -1,15 +1,14 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# This test is for the accept_unsolicited_na feature to +# This test is for the accept_untracked_na feature to # enable RFC9131 behaviour. The following is the test-matrix. # drop accept fwding behaviour # ---- ------ ------ ---------------------------------------------- -# 1 X X Drop NA packet and don't pass up the stack -# 0 0 X Pass NA packet up the stack, don't update NC -# 0 1 0 Pass NA packet up the stack, don't update NC -# 0 1 1 Pass NA packet up the stack, and add a STALE -# NC entry +# 1 X X Don't update NC +# 0 0 X Don't update NC +# 0 1 0 Don't update NC +# 0 1 1 Add a STALE NC entry ret=0 # Kselftest framework requirement - SKIP code is 4. @@ -72,7 +71,7 @@ setup() set -e local drop_unsolicited_na=$1 - local accept_unsolicited_na=$2 + local accept_untracked_na=$2 local forwarding=$3 # Setup two namespaces and a veth tunnel across them. 
@@ -93,7 +92,7 @@ setup() ${IP_ROUTER_EXEC} sysctl -qw \ ${ROUTER_CONF}.drop_unsolicited_na=${drop_unsolicited_na} ${IP_ROUTER_EXEC} sysctl -qw \ - ${ROUTER_CONF}.accept_unsolicited_na=${accept_unsolicited_na} + ${ROUTER_CONF}.accept_untracked_na=${accept_untracked_na} ${IP_ROUTER_EXEC} sysctl -qw ${ROUTER_CONF}.disable_ipv6=0 ${IP_ROUTER} addr add ${ROUTER_ADDR_WITH_MASK} dev ${ROUTER_INTF} @@ -144,13 +143,13 @@ link_up() { verify_ndisc() { local drop_unsolicited_na=$1 - local accept_unsolicited_na=$2 + local accept_untracked_na=$2 local forwarding=$3 neigh_show_output=$(${IP_ROUTER} neigh show \ to ${HOST_ADDR} dev ${ROUTER_INTF} nud stale) if [ ${drop_unsolicited_na} -eq 0 ] && \ - [ ${accept_unsolicited_na} -eq 1 ] && \ + [ ${accept_untracked_na} -eq 1 ] && \ [ ${forwarding} -eq 1 ]; then # Neighbour entry expected to be present for 011 case [[ ${neigh_show_output} ]] @@ -179,14 +178,14 @@ test_unsolicited_na_combination() { test_unsolicited_na_common $1 $2 $3 test_msg=("test_unsolicited_na: " "drop_unsolicited_na=$1 " - "accept_unsolicited_na=$2 " + "accept_untracked_na=$2 " "forwarding=$3") log_test $? 0 "${test_msg[*]}" cleanup } test_unsolicited_na_combinations() { - # Args: drop_unsolicited_na accept_unsolicited_na forwarding + # Args: drop_unsolicited_na accept_untracked_na forwarding # Expect entry test_unsolicited_na_combination 0 1 1 -- cgit v1.2.3 From 13b00b135665c92065a27c0c39dd97e0f380bd4f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:00 +0300 Subject: vdpa: Add support for querying vendor statistics Allows to read vendor statistics of a vdpa device. The specific statistics data are received from the upstream driver in the form of an (attribute name, attribute value) pairs. An example of statistics for mlx5_vdpa device are: received_desc - number of descriptors received by the virtqueue completed_desc - number of descriptors completed by the virtqueue A descriptor using indirect buffers is still counted as 1. In addition, N chained descriptors are counted correctly N times as one would expect. A new callback was added to vdpa_config_ops which provides the means for the vdpa driver to return statistics results. The interface allows for reading all the supported virtqueues, including the control virtqueue if it exists. Below are some examples taken from mlx5_vdpa which are introduced in the following patch: 1. Read statistics for the virtqueue at index 1 $ vdpa dev vstats show vdpa-a qidx 1 vdpa-a: queue_type tx queue_index 1 received_desc 3844836 completed_desc 3844836 2. Read statistics for the virtqueue at index 32 $ vdpa dev vstats show vdpa-a qidx 32 vdpa-a: queue_type control_vq queue_index 32 received_desc 62 completed_desc 62 3. Read statisitics for the virtqueue at index 0 with json output $ vdpa -j dev vstats show vdpa-a qidx 0 {"vstats":{"vdpa-a":{ "queue_type":"rx","queue_index":0,"name":"received_desc","value":417776,\ "name":"completed_desc","value":417548}}} 4. Read statistics for the virtqueue at index 0 with preety json output $ vdpa -jp dev vstats show vdpa-a qidx 0 { "vstats": { "vdpa-a": { "queue_type": "rx", "queue_index": 0, "name": "received_desc", "value": 417776, "name": "completed_desc", "value": 417548 } } } Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-3-elic@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/vdpa.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/vdpa.h | 3 + include/uapi/linux/vdpa.h | 6 ++ 3 files changed, 171 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index fac89a0d8178..31b5eb2c0778 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -914,6 +914,108 @@ out: return err; } +static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + struct virtio_net_config config = {}; + u64 features; + u16 max_vqp; + u8 status; + int err; + + status = vdev->config->get_status(vdev); + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { + NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); + return -EAGAIN; + } + vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); + + max_vqp = le16_to_cpu(config.max_virtqueue_pairs); + if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp)) + return -EMSGSIZE; + + features = vdev->config->get_driver_features(vdev); + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, + features, VDPA_ATTR_PAD)) + return -EMSGSIZE; + + if (nla_put_u32(msg, VDPA_ATTR_DEV_QUEUE_INDEX, index)) + return -EMSGSIZE; + + err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); + if (err) + return err; + + return 0; +} + +static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + int err; + + mutex_lock(&vdev->cf_mutex); + if (!vdev->config->get_vendor_vq_stats) { + err = -EOPNOTSUPP; + goto out; + } + + err = vdpa_fill_stats_rec(vdev, msg, info, index); +out: + mutex_unlock(&vdev->cf_mutex); + return err; +} + +static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, + struct sk_buff *msg, + struct genl_info *info, u32 index) +{ + u32 device_id; + void *hdr; + int err; + u32 portid = info->snd_portid; + u32 seq = info->snd_seq; + u32 flags = 0; + + hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, + VDPA_CMD_DEV_VSTATS_GET); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { + err = -EMSGSIZE; + goto undo_msg; + } + + device_id = vdev->config->get_device_id(vdev); + if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { + err = -EMSGSIZE; + goto undo_msg; + } + + switch (device_id) { + case VIRTIO_ID_NET: + if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { + NL_SET_ERR_MSG_MOD(info->extack, "queue index excceeds max value"); + err = -ERANGE; + break; + } + + err = vendor_stats_fill(vdev, msg, info, index); + break; + default: + err = -EOPNOTSUPP; + break; + } + genlmsg_end(msg, hdr); + + return err; + +undo_msg: + genlmsg_cancel(msg, hdr); + return err; +} + static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; @@ -995,6 +1097,60 @@ vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback * return msg->len; } +static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct vdpa_device *vdev; + struct sk_buff *msg; + const char *devname; + struct device *dev; + u32 index; + int err; + + if (!info->attrs[VDPA_ATTR_DEV_NAME]) + return -EINVAL; + + if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) + return -EINVAL; + + devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); + 
mutex_lock(&vdpa_dev_mutex); + dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); + if (!dev) { + NL_SET_ERR_MSG_MOD(info->extack, "device not found"); + err = -ENODEV; + goto dev_err; + } + vdev = container_of(dev, struct vdpa_device, dev); + if (!vdev->mdev) { + NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); + err = -EINVAL; + goto mdev_err; + } + err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); + if (err) + goto mdev_err; + + err = genlmsg_reply(msg, info); + + put_device(dev); + mutex_unlock(&vdpa_dev_mutex); + + return err; + +mdev_err: + put_device(dev); +dev_err: + nlmsg_free(msg); + mutex_unlock(&vdpa_dev_mutex); + return err; +} + static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, @@ -1035,6 +1191,12 @@ static const struct genl_ops vdpa_nl_ops[] = { .doit = vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, + { + .cmd = VDPA_CMD_DEV_VSTATS_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = vdpa_nl_cmd_dev_stats_get_doit, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family vdpa_nl_family __ro_after_init = { diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8943a209202e..2ae8443331e1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -276,6 +276,9 @@ struct vdpa_config_ops { const struct vdpa_vq_state *state); int (*get_vq_state)(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state); + int (*get_vendor_vq_stats)(struct vdpa_device *vdev, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack); struct vdpa_notification_area (*get_vq_notification)(struct vdpa_device *vdev, u16 idx); /* vq irq is not expected to be changed once DRIVER_OK is set */ diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 1061d8d2d09d..25c55cab3d7c 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -18,6 +18,7 @@ enum vdpa_command { VDPA_CMD_DEV_DEL, VDPA_CMD_DEV_GET, /* can dump */ VDPA_CMD_DEV_CONFIG_GET, /* can dump */ + VDPA_CMD_DEV_VSTATS_GET, }; enum vdpa_attr { @@ -46,6 +47,11 @@ enum vdpa_attr { VDPA_ATTR_DEV_NEGOTIATED_FEATURES, /* u64 */ VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, /* u32 */ VDPA_ATTR_DEV_SUPPORTED_FEATURES, /* u64 */ + + VDPA_ATTR_DEV_QUEUE_INDEX, /* u32 */ + VDPA_ATTR_DEV_VENDOR_ATTR_NAME, /* string */ + VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, /* u64 */ + /* new attributes must be added above here */ VDPA_ATTR_MAX, }; -- cgit v1.2.3 From 175d493c3c3e09a3abaa843068fae0f0ad42c47e Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:41 +0530 Subject: vhost: move the backend feature bits to vhost_types.h We should store feature bits in vhost_types.h as what has been done for e.g VHOST_F_LOG_ALL. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-2-gdawar@xilinx.com> Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/vhost.h | 5 ----- include/uapi/linux/vhost_types.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 5d99e7c242a2..8f7b4a95d6f9 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -89,11 +89,6 @@ /* Set or get vhost backend capability */ -/* Use message type V2 */ -#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 -/* IOTLB can accept batching hints */ -#define VHOST_BACKEND_F_IOTLB_BATCH 0x2 - #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) #define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index f7f6a3a28977..76ee7016c501 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -153,4 +153,9 @@ struct vhost_vdpa_iova_range { /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ #define VHOST_NET_F_VIRTIO_NET_HDR 27 +/* Use message type V2 */ +#define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 +/* IOTLB can accept batching hints */ +#define VHOST_BACKEND_F_IOTLB_BATCH 0x2 + #endif -- cgit v1.2.3 From 91233ad711866f4e375742d84ef3ed6aab9daa96 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:49 +0530 Subject: vhost: support ASID in IOTLB API This patches allows userspace to send ASID based IOTLB message to vhost. This idea is to use the reserved u32 field in the existing V2 IOTLB message. Vhost device should advertise this capability via VHOST_BACKEND_F_IOTLB_ASID backend feature. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-10-gdawar@xilinx.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vdpa.c | 5 ++++- drivers/vhost/vhost.c | 23 ++++++++++++++++++----- drivers/vhost/vhost.h | 4 ++-- include/uapi/linux/vhost_types.h | 6 +++++- 4 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 9202ff97ddb5..174c9e81df4e 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -870,7 +870,7 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, msg->perm); } -static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, +static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg) { struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); @@ -879,6 +879,9 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_iotlb *iotlb = v->iotlb; int r = 0; + if (asid != 0) + return -EINVAL; + mutex_lock(&dev->mutex); r = vhost_dev_check_owner(dev); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index d02173fb290c..d1e58f976f6e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -468,7 +468,7 @@ void vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs, int iov_limit, int weight, int byte_weight, bool use_worker, - int (*msg_handler)(struct vhost_dev *dev, + int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg)) { struct vhost_virtqueue *vq; @@ -1090,11 +1090,14 @@ static bool umem_access_ok(u64 uaddr, u64 size, int access) return true; } -static int vhost_process_iotlb_msg(struct vhost_dev *dev, +static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg) { int ret = 0; + if (asid != 0) + return -EINVAL; + mutex_lock(&dev->mutex); vhost_dev_lock_vqs(dev); switch (msg->type) { @@ -1141,6 +1144,7 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, struct vhost_iotlb_msg msg; size_t offset; int type, ret; + u32 asid = 0; ret = copy_from_iter(&type, sizeof(type), from); if (ret != sizeof(type)) { @@ -1156,7 +1160,16 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, offset = offsetof(struct vhost_msg, iotlb) - sizeof(int); break; case VHOST_IOTLB_MSG_V2: - offset = sizeof(__u32); + if (vhost_backend_has_feature(dev->vqs[0], + VHOST_BACKEND_F_IOTLB_ASID)) { + ret = copy_from_iter(&asid, sizeof(asid), from); + if (ret != sizeof(asid)) { + ret = -EINVAL; + goto done; + } + offset = sizeof(__u16); + } else + offset = sizeof(__u32); break; default: ret = -EINVAL; @@ -1178,9 +1191,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, } if (dev->msg_handler) - ret = dev->msg_handler(dev, &msg); + ret = dev->msg_handler(dev, asid, &msg); else - ret = vhost_process_iotlb_msg(dev, &msg); + ret = vhost_process_iotlb_msg(dev, asid, &msg); if (ret) { ret = -EFAULT; goto done; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 638bb640d6b4..9f238d6c7b58 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -161,7 +161,7 @@ struct vhost_dev { int byte_weight; u64 kcov_handle; bool use_worker; - int (*msg_handler)(struct vhost_dev *dev, + int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg); }; @@ -169,7 +169,7 @@ bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs, int iov_limit, int weight, int byte_weight, bool use_worker, - int (*msg_handler)(struct vhost_dev *dev, + int (*msg_handler)(struct vhost_dev *dev, u32 asid, 
struct vhost_iotlb_msg *msg)); long vhost_dev_set_owner(struct vhost_dev *dev); bool vhost_dev_has_owner(struct vhost_dev *dev); diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 76ee7016c501..634cee485abb 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -87,7 +87,7 @@ struct vhost_msg { struct vhost_msg_v2 { __u32 type; - __u32 reserved; + __u32 asid; union { struct vhost_iotlb_msg iotlb; __u8 padding[64]; @@ -157,5 +157,9 @@ struct vhost_vdpa_iova_range { #define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 /* IOTLB can accept batching hints */ #define VHOST_BACKEND_F_IOTLB_BATCH 0x2 +/* IOTLB can accept address space identifier through V2 type of IOTLB + * message + */ +#define VHOST_BACKEND_F_IOTLB_ASID 0x3 #endif -- cgit v1.2.3 From 3ace88bd37436abc84906312146fe5158a469142 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:51 +0530 Subject: vhost-vdpa: introduce uAPI to get the number of virtqueue groups Follows the vDPA support for multiple address spaces, this patch introduce uAPI for the userspace to know the number of virtqueue groups supported by the vDPA device. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-12-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 4 ++++ include/uapi/linux/vhost.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index cd1bee536c46..92f78df0f685 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -559,6 +559,10 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_VDPA_GET_VRING_NUM: r = vhost_vdpa_get_vring_num(v, argp); break; + case VHOST_VDPA_GET_GROUP_NUM: + r = copy_to_user(argp, &v->vdpa->ngroups, + sizeof(v->vdpa->ngroups)); + break; case VHOST_SET_LOG_BASE: case VHOST_SET_LOG_FD: r = -ENOIOCTLCMD; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 8f7b4a95d6f9..61317c61d768 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -145,11 +145,13 @@ /* Get the valid iova range */ #define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ struct vhost_vdpa_iova_range) - /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) /* Get the count of all virtqueues */ #define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + #endif -- cgit v1.2.3 From a0c95f201170bd559737d3cdc8a950aea62f29c6 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:52 +0530 Subject: vhost-vdpa: introduce uAPI to get the number of address spaces This patch introduces the uAPI for getting the number of address spaces supported by this vDPA device. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-13-gdawar@xilinx.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vdpa.c | 3 +++ include/uapi/linux/vhost.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 92f78df0f685..a017011ad1f5 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -563,6 +563,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, r = copy_to_user(argp, &v->vdpa->ngroups, sizeof(v->vdpa->ngroups)); break; + case VHOST_VDPA_GET_AS_NUM: + r = copy_to_user(argp, &v->vdpa->nas, sizeof(v->vdpa->nas)); + break; case VHOST_SET_LOG_BASE: case VHOST_SET_LOG_FD: r = -ENOIOCTLCMD; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 61317c61d768..51322008901a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -154,4 +154,6 @@ /* Get the number of virtqueue groups. */ #define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) +/* Get the number of address spaces. */ +#define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) #endif -- cgit v1.2.3 From 2d1fcb7758e49fd9caf150f3c70804b95b2ce80c Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:53 +0530 Subject: vhost-vdpa: uAPI to get virtqueue group id Follows the support for virtqueue group in vDPA. This patches introduces uAPI to get the virtqueue group ID for a specific virtqueue in vhost-vdpa. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-14-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 8 ++++++++ include/uapi/linux/vhost.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a017011ad1f5..aa5cacdc5263 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -465,6 +465,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, return -EFAULT; ops->set_vq_ready(vdpa, idx, s.num); return 0; + case VHOST_VDPA_GET_VRING_GROUP: + s.index = idx; + s.num = ops->get_vq_group(vdpa, idx); + if (s.num >= vdpa->ngroups) + return -EIO; + else if (copy_to_user(argp, &s, sizeof(s))) + return -EFAULT; + return 0; case VHOST_GET_VRING_BASE: r = ops->get_vq_state(v->vdpa, idx, &vq_state); if (r) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 51322008901a..668914c87f74 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -156,4 +156,12 @@ /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) + +/* Get the group for a virtqueue: read index, write group in num, + * The virtqueue index is stored in the index field of + * vhost_vring_state. The group for this specific virtqueue is + * returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ + struct vhost_vring_state) #endif -- cgit v1.2.3 From 84d7c8fd3aade2fe79313003ed06ede431ec2a6d Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:54 +0530 Subject: vhost-vdpa: introduce uAPI to set group ASID Follows the vDPA support for associating ASID to a specific virtqueue group. This patch adds a uAPI to support setting them from userspace. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-15-gdawar@xilinx.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vdpa.c | 8 ++++++++ include/uapi/linux/vhost.h | 7 +++++++ 2 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index aa5cacdc5263..6c7ee0f18892 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -473,6 +473,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, else if (copy_to_user(argp, &s, sizeof(s))) return -EFAULT; return 0; + case VHOST_VDPA_SET_GROUP_ASID: + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + if (s.num >= vdpa->nas) + return -EINVAL; + if (!ops->set_group_asid) + return -EOPNOTSUPP; + return ops->set_group_asid(vdpa, idx, s.num); case VHOST_GET_VRING_BASE: r = ops->get_vq_state(v->vdpa, idx, &vq_state); if (r) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 668914c87f74..cab645d4a645 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -164,4 +164,11 @@ */ #define VHOST_VDPA_GET_VRING_GROUP _IOWR(VHOST_VIRTIO, 0x7B, \ struct vhost_vring_state) +/* Set the ASID for a virtqueue group. The group index is stored in + * the index field of vhost_vring_state, the ASID associated with this + * group is stored at num field of vhost_vring_state. + */ +#define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ + struct vhost_vring_state) + #endif -- cgit v1.2.3 From 35d02493dba1ae6386fac07072908717affc3ff8 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:21 +0000 Subject: KVM: s390: pv: Add query interface Some of the query information is already available via sysfs but having a IOCTL makes the information easier to retrieve. Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20220517163629.3443-4-frankja@linux.ibm.com Message-Id: <20220517163629.3443-4-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 25 ++++++++++++++++ 2 files changed, 101 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 76ad6408cb2c..5859f243d287 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2224,6 +2224,42 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) return r; } +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. 
+ */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + default: + return -EINVAL; + } +} + static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { int r = 0; @@ -2360,6 +2396,46 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) cmd->rc, cmd->rrc); break; } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. + * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..5a5f66026dd3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1660,6 +1660,30 @@ struct kvm_s390_pv_unp { __u64 tweak; }; +enum pv_cmd_info_id { + KVM_PV_INFO_VM, +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + struct kvm_s390_pv_info_vm vm; +}; + enum pv_cmd_id { KVM_PV_ENABLE, KVM_PV_DISABLE, @@ -1668,6 +1692,7 @@ enum pv_cmd_id { KVM_PV_VERIFY, KVM_PV_PREP_RESET, KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, }; struct kvm_pv_cmd { -- cgit v1.2.3 From fe9a93e07ba4f29def2f8a4318b63e0c70a5c6c2 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:23 +0000 Subject: KVM: s390: pv: Add query dump information The dump API requires userspace to provide buffers into which we will store data. The dump information added in this patch tells userspace how big those buffers need to be. 
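To illustrate, userspace could query those lengths before allocating its buffers. This is a minimal sketch, not part of the patch: it assumes an open VM file descriptor vm_fd plus the usual <linux/kvm.h> and <sys/ioctl.h> headers, and it reuses the pre-existing KVM_S390_PV_COMMAND ioctl and struct kvm_pv_cmd:

	/* Query the UV dump buffer sizes via the new KVM_PV_INFO_DUMP id. */
	struct kvm_s390_pv_info info = {
		.header.id = KVM_PV_INFO_DUMP,
		.header.len_max = sizeof(info.header) + sizeof(info.dump),
	};
	struct kvm_pv_cmd cmd = {
		.cmd = KVM_PV_INFO,
		.data = (__u64)&info,
	};

	/* On success, header.len_written says how much of info is valid. */
	if (ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd) == 0)
		printf("per-CPU dump buffer: %llu bytes\n",
		       (unsigned long long)info.dump.dump_cpu_buffer_len);
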
Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20220517163629.3443-6-frankja@linux.ibm.com Message-Id: <20220517163629.3443-6-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 11 +++++++++++ include/uapi/linux/kvm.h | 12 +++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5859f243d287..de54f14e081e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2255,6 +2255,17 @@ static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) return len_min; } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5a5f66026dd3..065a05ec06b6 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1662,6 +1662,13 @@ struct kvm_s390_pv_unp { enum pv_cmd_info_id { KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; }; struct kvm_s390_pv_info_vm { @@ -1681,7 +1688,10 @@ struct kvm_s390_pv_info_header { struct kvm_s390_pv_info { struct kvm_s390_pv_info_header header; - struct kvm_s390_pv_info_vm vm; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; }; enum pv_cmd_id { -- cgit v1.2.3 From 0460eb35b443f73f8a8e3be1ea87bd690a852e20 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:24 +0000 Subject: KVM: s390: Add configuration dump functionality Sometimes dumping inside of a VM fails, is unavailable or doesn't yield the required data. For these occasions we dump the VM from the outside, writing memory and cpu data to a file. Up to now PV guests only supported dumping from the inside of the guest through dumpers like KDUMP. A PV guest can be dumped from the hypervisor but the data will be stale and / or encrypted. To get the actual state of the PV VM we need the help of the Ultravisor who safeguards the VM state. New UV calls have been added to initialize the dump, dump storage state data, dump cpu data and complete the dump process. We expose these calls in this patch via a new UV ioctl command. The sensitive parts of the dump data are encrypted, the dump key is derived from the Customer Communication Key (CCK). This ensures that only the owner of the VM who has the CCK can decrypt the dump data. The memory is dumped / read via a normal export call and a re-import after the dump initialization is not needed (no re-encryption with a dump key). 
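Taken together with the other patches in this series, the expected dump flow from userspace looks roughly like this (an illustrative sketch only: vm_fd, buf and the length values are assumptions, and error as well as rc/rrc handling is omitted):

	/* 1) Block SIE entry and initialize the dump. */
	struct kvm_s390_pv_dmp dmp = { .subcmd = KVM_PV_DUMP_INIT };
	struct kvm_pv_cmd cmd = { .cmd = KVM_PV_DUMP, .data = (__u64)&dmp };
	ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);

	/* 2) Fetch the storage state, 1MB blocks at a time; the kernel
	 *    advances dmp.gaddr, so the call can simply be repeated. */
	dmp.subcmd = KVM_PV_DUMP_CONFIG_STOR_STATE;
	dmp.buff_addr = (__u64)buf;
	dmp.buff_len = buf_len;	/* multiple of conf_dump_storage_state_len */
	dmp.gaddr = 0;
	ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);

	/* 3) Complete the dump and collect the completion data. */
	dmp.subcmd = KVM_PV_DUMP_COMPLETE;
	dmp.buff_len = finalize_len;	/* conf_dump_finalize_len from KVM_PV_INFO */
	ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);
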
Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220517163629.3443-7-frankja@linux.ibm.com Message-Id: <20220517163629.3443-7-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 93 ++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 4 + arch/s390/kvm/pv.c | 182 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 15 ++++ 5 files changed, 295 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 766028d54a3e..a0fbe4820e0a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -923,6 +923,7 @@ struct kvm_s390_pv { u64 guest_len; unsigned long stor_base; void *stor_var; + bool dumping; }; struct kvm_arch{ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index de54f14e081e..1d00aead6bc5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2271,6 +2271,68 @@ static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) } } +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { int r = 0; @@ -2447,6 +2509,28 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) r = 0; break; } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } default: r = -ENOTTY; } @@ -4564,6 +4648,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; int rc; + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. 
+ */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + if (kvm_run->immediate_exit) return -EINTR; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 497d52a83c78..2c11eb5ba3ef 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -250,6 +250,10 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, unsigned long tweak, u16 *rc, u16 *rrc); int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) { diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index cc7c9599f43e..e9912113879c 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include @@ -298,3 +299,184 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) return -EINVAL; return 0; } + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned. For instance + * if we encounter a fault after writing the first page of data. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. 
+ */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Lets only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. 
+ */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 065a05ec06b6..673be2061c6c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1660,6 +1660,20 @@ struct kvm_s390_pv_unp { __u64 tweak; }; +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + enum pv_cmd_info_id { KVM_PV_INFO_VM, KVM_PV_INFO_DUMP, @@ -1703,6 +1717,7 @@ enum pv_cmd_id { KVM_PV_PREP_RESET, KVM_PV_UNSHARE_ALL, KVM_PV_INFO, + KVM_PV_DUMP, }; struct kvm_pv_cmd { -- cgit v1.2.3 From 8aba09588d2af37c6cc1a781b87d1d91ebf389ae Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:25 +0000 Subject: KVM: s390: Add CPU dump functionality The previous patch introduced the per-VM dump functions now let's focus on dumping the VCPU state via the newly introduced KVM_S390_PV_CPU_COMMAND ioctl which mirrors the VM UV ioctl and can be extended with new commands later. Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220517163629.3443-8-frankja@linux.ibm.com Message-Id: <20220517163629.3443-8-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 1 + arch/s390/kvm/pv.c | 16 +++++++++++ include/uapi/linux/kvm.h | 4 +++ 4 files changed, 90 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1d00aead6bc5..37be2a33edb5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5096,6 +5096,48 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, return -ENOIOCTLCMD; } +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. 
*/ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5260,6 +5302,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, irq_state.len); break; } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } default: r = -ENOTTY; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 2c11eb5ba3ef..dd01d989816f 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -250,6 +250,7 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, unsigned long tweak, u16 *rc, u16 *rrc); int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index e9912113879c..b4a499b10b67 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -300,6 +300,22 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) return 0; } +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + /* Size of the cache for the storage state dump data. 
1MB for now */ #define DUMP_BUFF_LEN HPAGE_SIZE diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 673be2061c6c..af5d254f8061 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1664,6 +1664,7 @@ enum pv_cmd_dmp_id { KVM_PV_DUMP_INIT, KVM_PV_DUMP_CONFIG_STOR_STATE, KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, }; struct kvm_s390_pv_dmp { @@ -2168,4 +2169,7 @@ struct kvm_stats_desc { /* Available with KVM_CAP_XSAVE2 */ #define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) +/* Available with KVM_CAP_S390_PROTECTED_DUMP */ +#define KVM_S390_PV_CPU_COMMAND _IOWR(KVMIO, 0xd0, struct kvm_pv_cmd) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From e9bf3acb23f0a6e18438c35944d6cb618d16cf05 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:26 +0000 Subject: KVM: s390: Add KVM_CAP_S390_PROTECTED_DUMP The capability indicates dump support for protected VMs. Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220517163629.3443-9-frankja@linux.ibm.com Message-Id: <20220517163629.3443-9-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 20 ++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 37be2a33edb5..d1a32eb3cf5d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -606,6 +606,26 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } default: r = 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index af5d254f8061..c4a32910b88a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1157,6 +1157,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_TSC_CONTROL 214 #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 +#define KVM_CAP_S390_PROTECTED_DUMP 217 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 662ce1dc9caf493c309200edbe38d186f1ea20d0 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 1 Jun 2022 15:55:25 -0700 Subject: delayacct: track delays from write-protect copy Delay accounting does not track the delay of write-protect copy. When tasks trigger many write-protect copies (including COW and unsharing of anonymous pages[1]), they may spend a significant amount of time waiting for them. Tracking the delay of tasks in write-protect copy can help users evaluate the impact of using KSM, fork() or GUP.
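For example, a monitoring tool could turn two taskstats samples into an average write-protect copy latency using the new fields (an illustrative helper, not part of this patch; the delay totals are in nanoseconds):

	/* Hypothetical helper: mean WPCOPY delay in ms between two samples. */
	static double wpcopy_avg_ms(const struct taskstats *prev,
				    const struct taskstats *curr)
	{
		__u64 delay = curr->wpcopy_delay_total - prev->wpcopy_delay_total;
		__u64 count = curr->wpcopy_count - prev->wpcopy_count;

		return count ? (double)delay / 1000000.0 / count : 0.0;
	}
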
Also update tools/accounting/getdelays.c: / # ./getdelays -dl -p 231 print delayacct stats ON listen forever PID 231 CPU count real total virtual total delay total delay average 6247 1859000000 2154070021 1674255063 0.268ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms WPCOPY count delay total delay average 3635 271567604 0ms [1] commit 31cc5bc4af70("mm: support GUP-triggered unsharing of anonymous pages") Link: https://lkml.kernel.org/r/20220409014342.2505532-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: David Hildenbrand Reviewed-by: Jiang Xuexin Reviewed-by: Ran Xiaokai Reviewed-by: wangyong Cc: Jonathan Corbet Cc: Balbir Singh Cc: Mike Kravetz Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 5 ++++- include/linux/delayacct.h | 28 +++++++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 16 +++++++++++++++ mm/hugetlb.c | 8 ++++++++ mm/memory.c | 8 ++++++++ tools/accounting/getdelays.c | 8 +++++++- 7 files changed, 76 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 197fe319cbec..241d1a87f2cd 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -15,6 +15,7 @@ c) swapping in pages d) memory reclaim e) thrashing page cache f) direct compact +g) write-protect copy and makes these statistics available to userspace through the taskstats interface. @@ -48,7 +49,7 @@ this structure. See for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page -cache, direct compact etc. +cache, direct compact, write-protect copy etc. 
Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -117,6 +118,8 @@ Get sum of delays, since system boot, for all pids with tgid 5:: 0 0 0ms COMPACT count delay total delay average 0 0 0ms + WPCOPY count delay total delay average + 0 0 0ms Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6b16a6930a19..58aea2d7385c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -45,9 +45,13 @@ struct task_delay_info { u64 compact_start; u64 compact_delay; /* wait for memory compact */ + u64 wpcopy_start; + u64 wpcopy_delay; /* wait for write-protect copy */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ + u32 wpcopy_count; /* total count of write-protect copy */ }; #endif @@ -75,6 +79,8 @@ extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); +extern void __delayacct_wpcopy_start(void); +extern void __delayacct_wpcopy_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -191,6 +197,24 @@ static inline void delayacct_compact_end(void) __delayacct_compact_end(); } +static inline void delayacct_wpcopy_start(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_wpcopy_start(); +} + +static inline void delayacct_wpcopy_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_wpcopy_end(); +} + #else static inline void delayacct_init(void) {} @@ -225,6 +249,10 @@ static inline void delayacct_compact_start(void) {} static inline void delayacct_compact_end(void) {} +static inline void delayacct_wpcopy_start(void) +{} +static inline void delayacct_wpcopy_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 736154171489..a7f5b11a8f1b 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 12 +#define TASKSTATS_VERSION 13 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -194,6 +194,10 @@ struct taskstats { __u64 ac_exe_dev; /* program binary device ID */ __u64 ac_exe_inode; /* program binary inode number */ /* v12 end */ + + /* v13: Delay waiting for write-protect copy */ + __u64 wpcopy_count; + __u64 wpcopy_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2c1e18f7c5cf..164ed9ef77a3 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -177,11 +177,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; + tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; + d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; + d->wpcopy_count += tsk->delays->wpcopy_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -249,3 +252,16 @@ void __delayacct_compact_end(void) &current->delays->compact_delay, &current->delays->compact_count); } + +void __delayacct_wpcopy_start(void) +{ + current->delays->wpcopy_start = local_clock(); +} + +void __delayacct_wpcopy_end(void) +{ + delayacct_end(&current->delays->lock, + &current->delays->wpcopy_start, + &current->delays->wpcopy_delay, + &current->delays->wpcopy_count); +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c468ac1d069..a57e1be41401 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -5230,6 +5231,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, pte = huge_ptep_get(ptep); old_page = pte_page(pte); + delayacct_wpcopy_start(); + retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive @@ -5240,6 +5243,8 @@ retry_avoidcopy: page_move_anon_rmap(old_page, vma); if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); + + delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), @@ -5309,6 +5314,7 @@ retry_avoidcopy: * race occurs while re-acquiring page table * lock, and our job is done. */ + delayacct_wpcopy_end(); return 0; } @@ -5367,6 +5373,8 @@ out_release_old: put_page(old_page); spin_lock(ptl); /* Caller expects lock to be held */ + + delayacct_wpcopy_end(); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 21dadf03f089..7a089145cad4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3090,6 +3090,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) int page_copied = 0; struct mmu_notifier_range range; + delayacct_wpcopy_start(); + if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -3114,6 +3116,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(new_page); if (old_page) put_page(old_page); + + delayacct_wpcopy_end(); return 0; } } @@ -3220,12 +3224,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) free_swap_cache(old_page); put_page(old_page); } + + delayacct_wpcopy_end(); return (page_copied && !unshare) ?
VM_FAULT_WRITE : 0; oom_free_new: put_page(new_page); oom: if (old_page) put_page(old_page); + + delayacct_wpcopy_end(); return VM_FAULT_OOM; } diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 11e86739456d..e83e6e47a21e 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -207,6 +207,8 @@ static void print_delayacct(struct taskstats *t) "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n" "COMPACT %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "WPCOPY %12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -234,7 +236,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->compact_count, (unsigned long long)t->compact_delay_total, - average_ms(t->compact_delay_total, t->compact_count)); + average_ms(t->compact_delay_total, t->compact_count), + "count", "delay total", "delay average", + (unsigned long long)t->wpcopy_count, + (unsigned long long)t->wpcopy_delay_total, + average_ms(t->wpcopy_delay_total, t->wpcopy_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3 From 8d3398ba2a0d1e25690f830192b7834acab003ec Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 31 May 2022 11:43:45 +0200 Subject: socket: Don't use u8 type in uapi socket.h Use plain 255 instead, which also avoid introducing an additional header dependency on Fixes: 26859240e4ee ("txhash: Add socket option to control TX hash rethink behavior") Signed-off-by: Tobias Klauser Link: https://lore.kernel.org/r/20220531094345.13801-1-tklauser@distanz.ch Signed-off-by: Jakub Kicinski --- include/uapi/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index 51d6bb2f6765..d3fcd3b5ec53 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,7 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) -#define SOCK_TXREHASH_DEFAULT ((u8)-1) +#define SOCK_TXREHASH_DEFAULT 255 #define SOCK_TXREHASH_DISABLED 0 #define SOCK_TXREHASH_ENABLED 1 -- cgit v1.2.3 From 8cc5b032240ae5220b62c689c20459d3e1825b2d Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Wed, 1 Jun 2022 01:00:17 +0000 Subject: binder: fix sender_euid type in uapi header The {pid,uid}_t fields of struct binder_transaction were recently replaced to use kernel types in commit 169adc2b6b3c ("android/binder.h: add linux/android/binder(fs).h to UAPI compile-test coverage"). However, using __kernel_uid_t here breaks backwards compatibility in architectures using 16-bits for this type, since glibc and some others still expect a 32-bit uid_t. Instead, let's use __kernel_uid32_t which avoids this compatibility problem. Fixes: 169adc2b6b3c ("android/binder.h: add linux/android/binder(fs).h to UAPI compile-test coverage") Reported-by: Christopher Ferris Signed-off-by: Carlos Llamas Acked-by: Todd Kjos Signed-off-by: Arnd Bergmann --- include/uapi/linux/android/binder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 11157fae8a8e..688bcdaeed53 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -289,7 +289,7 @@ struct binder_transaction_data { /* General information about the transaction. 
*/ __u32 flags; __kernel_pid_t sender_pid; - __kernel_uid_t sender_euid; + __kernel_uid32_t sender_euid; binder_size_t data_size; /* number of bytes of data */ binder_size_t offsets_size; /* number of bytes of offsets */ -- cgit v1.2.3 From 08145b087e4481458f6075f3af58021a3cf8a940 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 31 May 2022 18:04:10 +0800 Subject: LoongArch: Add ELF-related definitions Add ELF-related definitions for LoongArch, including: EM_LOONGARCH, KEXEC_ARCH_LOONGARCH, AUDIT_ARCH_LOONGARCH32, AUDIT_ARCH_LOONGARCH64 and NT_LOONGARCH_*. Reviewed-by: WANG Xuerui Reviewed-by: Jiaxun Yang Signed-off-by: Huacai Chen --- include/uapi/linux/audit.h | 2 ++ include/uapi/linux/elf-em.h | 1 + include/uapi/linux/elf.h | 5 +++++ include/uapi/linux/kexec.h | 1 + scripts/sorttable.c | 5 +++++ 5 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 8eda133ca4c1..7c1dc818b1d5 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -439,6 +439,8 @@ enum { #define AUDIT_ARCH_UNICORE (EM_UNICORE|__AUDIT_ARCH_LE) #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_XTENSA (EM_XTENSA) +#define AUDIT_ARCH_LOONGARCH32 (EM_LOONGARCH|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_LOONGARCH64 (EM_LOONGARCH|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_PERM_EXEC 1 #define AUDIT_PERM_WRITE 2 diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index f47e853546fa..ef38c2bc5ab7 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -51,6 +51,7 @@ #define EM_RISCV 243 /* RISC-V */ #define EM_BPF 247 /* Linux BPF - in-kernel virtual machine */ #define EM_CSKY 252 /* C-SKY */ +#define EM_LOONGARCH 258 /* LoongArch */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ /* diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index c4abd09c3da9..2b9f5e9985e5 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -438,6 +438,11 @@ typedef struct elf64_shdr { #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ /* Note types with note name "GNU" */ #define NT_GNU_PROPERTY_TYPE_0 5 diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index fb7e2ef60825..981016e05cfa 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -43,6 +43,7 @@ #define KEXEC_ARCH_MIPS ( 8 << 16) #define KEXEC_ARCH_AARCH64 (183 << 16) #define KEXEC_ARCH_RISCV (243 << 16) +#define KEXEC_ARCH_LOONGARCH (258 << 16) /* The artificial cap on the number of segments passed to kexec_load. 
*/ #define KEXEC_SEGMENT_MAX 16 diff --git a/scripts/sorttable.c b/scripts/sorttable.c index d00504c5f530..fba40e99f354 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -60,6 +60,10 @@ #define EM_RISCV 243 #endif +#ifndef EM_LOONGARCH +#define EM_LOONGARCH 258 +#endif + static uint32_t (*r)(const uint32_t *); static uint16_t (*r2)(const uint16_t *); static uint64_t (*r8)(const uint64_t *); @@ -313,6 +317,7 @@ static int do_file(char const *const fname, void *addr) case EM_ARCOMPACT: case EM_ARCV2: case EM_ARM: + case EM_LOONGARCH: case EM_MICROBLAZE: case EM_MIPS: case EM_XTENSA: -- cgit v1.2.3 From 6089fb325cf737eeb2c4d236c94697112ca860da Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 6 Jun 2022 23:26:00 -0700 Subject: bpf: Add btf enum64 support Currently, BTF only supports up to 32-bit enum values with BTF_KIND_ENUM. But in the kernel, some enums do have 64-bit values, e.g., in uapi bpf.h we have enum { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, BPF_F_CTXLEN_MASK = (0xfffffULL << 32), }; In this case, BTF_KIND_ENUM will encode the value of BPF_F_CTXLEN_MASK as 0, which is certainly incorrect. This patch adds a new btf kind, BTF_KIND_ENUM64, which permits 64-bit values to cover the above use case. The BTF_KIND_ENUM64 has the following three fields followed by the common type: struct bpf_enum64 { __u32 name_off; __u32 val_lo32; __u32 val_hi32; }; Currently, the btf type section has an alignment of 4 as all element types are u32. Representing the value with a __u64 would introduce a pad for bpf_enum64 and may also introduce misalignment for the 64-bit value. Hence, the two members val_hi32 and val_lo32 are chosen to avoid these issues. The kflag is also introduced for BTF_KIND_ENUM and BTF_KIND_ENUM64 to indicate whether the value is signed or unsigned. The kflag intends to provide consistent output of the BTF C format with the original source code. For example, say the original BTF_KIND_ENUM bit value is 0xffffffff. The C format has two choices, printing out 0xffffffff or -1, and current libbpf prints it out as an unsigned value. But if the signedness is preserved in BTF, the value can be printed the same as in the original source code. The kflag value 0 means unsigned values, which is consistent with libbpf's default and should also cover most cases. The new BTF_KIND_ENUM64 is intended to support enum values represented as 64-bit values. But it can represent all BTF_KIND_ENUM values as well. The compiler ([1]) and pahole will generate BTF_KIND_ENUM64 only if the value has to be represented with 64 bits. In addition, a static inline function btf_kind_core_compat() is introduced, which will be used later when libbpf's relo_core.c is changed. Here the kernel shares the same relo_core.c with libbpf.
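As a concrete illustration of the encoding, BPF_F_CTXLEN_MASK from the enum above would be split and reassembled like this (a sketch; the name_off value is hypothetical since it depends on the string section layout):

	__u64 val = 0xfffffULL << 32;		/* BPF_F_CTXLEN_MASK */
	struct btf_enum64 e = {
		.name_off = 1,			/* assumed string offset */
		.val_lo32 = (__u32)val,		/* 0x00000000 */
		.val_hi32 = (__u32)(val >> 32),	/* 0x000fffff */
	};

	/* The kernel reassembles it exactly as btf_enum64_value() does: */
	__u64 v = ((__u64)e.val_hi32 << 32) | e.val_lo32;
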
[1] https://reviews.llvm.org/D124641 Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220607062600.3716578-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 28 ++++++++ include/uapi/linux/btf.h | 17 ++++- kernel/bpf/btf.c | 142 +++++++++++++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 2 +- tools/include/uapi/linux/btf.h | 17 ++++- 5 files changed, 185 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 2611cea2c2b6..1bfed7fa0428 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -177,6 +177,19 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM || + BTF_INFO_KIND(t->info) == BTF_KIND_ENUM64; +} + +static inline bool btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return BTF_INFO_KIND(t1->info) == BTF_INFO_KIND(t2->info) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + static inline bool str_is_empty(const char *s) { return !s || !s[0]; @@ -192,6 +205,16 @@ static inline bool btf_is_enum(const struct btf_type *t) return btf_kind(t) == BTF_KIND_ENUM; } +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + +static inline u64 btf_enum64_value(const struct btf_enum64 *e) +{ + return ((u64)e->val_hi32 << 32) | e->val_lo32; +} + static inline bool btf_is_composite(const struct btf_type *t) { u16 kind = btf_kind(t); @@ -332,6 +355,11 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t) return (struct btf_enum *)(t + 1); } +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index a9162a6c0284..ec1798b6d3ff 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -36,10 +36,10 @@ struct btf_type { * bits 24-28: kind (e.g. int, ptr, array...etc) * bits 29-30: unused * bit 31: kind_flag, currently used by - * struct, union and fwd + * struct, union, enum, fwd and enum64 */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, @@ -63,7 +63,7 @@ enum { BTF_KIND_ARRAY = 3, /* Array */ BTF_KIND_STRUCT = 4, /* Struct */ BTF_KIND_UNION = 5, /* Union */ - BTF_KIND_ENUM = 6, /* Enumeration */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ BTF_KIND_FWD = 7, /* Forward */ BTF_KIND_TYPEDEF = 8, /* Typedef */ BTF_KIND_VOLATILE = 9, /* Volatile */ @@ -76,6 +76,7 @@ enum { BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, @@ -186,4 +187,14 @@ struct btf_decl_tag { __s32 component_idx; }; +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). 
+ */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7bccaa4646e5..6c0d8480e15c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) @@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: return true; } @@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) return (const struct btf_decl_tag *)(t + 1); } +static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) +{ + return (const struct btf_enum64 *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show) parens = "{"; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: prefix = "enum"; break; default: @@ -1834,6 +1842,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: size = type->size; goto resolved; @@ -3670,6 +3679,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; + const char *fmt_str; u16 i, nr_enums; u32 meta_needed; @@ -3683,11 +3693,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; @@ -3718,7 +3723,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, if (env->log.level == BPF_LOG_KERNEL) continue; - btf_verifier_log(env, "\t%s val=%d\n", + fmt_str = btf_type_kflag(t) ? 
"\t%s val=%d\n" : "\t%s val=%u\n"; + btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -3759,7 +3765,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, return; } - btf_show_type_value(show, "%d", v); + if (btf_type_kflag(t)) + btf_show_type_value(show, "%d", v); + else + btf_show_type_value(show, "%u", v); btf_show_end_type(show); } @@ -3772,6 +3781,109 @@ static struct btf_kind_operations enum_ops = { .show = btf_enum_show, }; +static s32 btf_enum64_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + struct btf *btf = env->btf; + const char *fmt_str; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); + return -EINVAL; + } + + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (env->log.level == BPF_LOG_KERNEL) + continue; + + fmt_str = btf_type_kflag(t) ? 
"\t%s val=%lld\n" : "\t%s val=%llu\n"; + btf_verifier_log(env, fmt_str, + __btf_name_by_offset(btf, enums[i].name_off), + btf_enum64_value(enums + i)); + } + + return meta_needed; +} + +static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + u32 i, nr_enums = btf_type_vlen(t); + void *safe_data; + s64 v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(u64 *)safe_data; + + for (i = 0; i < nr_enums; i++) { + if (v != btf_enum64_value(enums + i)) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; + } + + if (btf_type_kflag(t)) + btf_show_type_value(show, "%lld", v); + else + btf_show_type_value(show, "%llu", v); + btf_show_end_type(show); +} + +static struct btf_kind_operations enum64_ops = { + .check_meta = btf_enum64_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, + .log_details = btf_enum_log, + .show = btf_enum64_show, +}; + static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -4438,6 +4550,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, + [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -5299,7 +5412,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5763,7 +5876,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t)) return t->size; *bad_type = t; return -EINVAL; @@ -5911,7 +6024,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, * to context only. And only global functions can be replaced. * Hence type check only those types. */ - if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, @@ -6408,7 +6521,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { bpf_log(log, "Global function %s() doesn't return scalar. 
Only those are supported.\n", tname); @@ -6423,7 +6536,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (btf_type_is_int(t) || btf_is_any_enum(t)) { reg->type = SCALAR_VALUE; continue; } @@ -7335,6 +7448,7 @@ recur: case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_ENUM64: return 1; case BTF_KIND_INT: /* just reject deprecated bitfield-like integers; all other @@ -7387,10 +7501,10 @@ recur: * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: - * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); - * - for ENUMs, the size is ignored; + * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aedac2ac02b9..2d2872682278 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10901,7 +10901,7 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = - btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index a9162a6c0284..ec1798b6d3ff 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -36,10 +36,10 @@ struct btf_type { * bits 24-28: kind (e.g. int, ptr, array...etc) * bits 29-30: unused * bit 31: kind_flag, currently used by - * struct, union and fwd + * struct, union, enum, fwd and enum64 */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, @@ -63,7 +63,7 @@ enum { BTF_KIND_ARRAY = 3, /* Array */ BTF_KIND_STRUCT = 4, /* Struct */ BTF_KIND_UNION = 5, /* Union */ - BTF_KIND_ENUM = 6, /* Enumeration */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ BTF_KIND_FWD = 7, /* Forward */ BTF_KIND_TYPEDEF = 8, /* Typedef */ BTF_KIND_VOLATILE = 9, /* Volatile */ @@ -76,6 +76,7 @@ enum { BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, @@ -186,4 +187,14 @@ struct btf_decl_tag { __s32 component_idx; }; +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). 
+ */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From ed2351174e38ad4febbbc0dba802803e6cff8ae0 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 24 May 2022 21:56:21 +0800 Subject: KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault For the triple fault synthesized by KVM, e.g. the RSM path or nested_vmx_abort(), if KVM exits to userspace before the request is serviced, userspace could migrate the VM and lose the triple fault. Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault with a new event KVM_VCPUEVENT_VALID_TRIPLE_FAULT so that userspace can save and restore the triple fault event. This extension is guarded by a new KVM capability KVM_CAP_X86_TRIPLE_FAULT_EVENT. Note that in the set_vcpu_events path, userspace is able to set/clear the triple fault request through the triple_fault.pending field. Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-2-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 8 ++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm.h | 6 +++++- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- include/uapi/linux/kvm.h | 1 + 5 files changed, 36 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 421479a67da5..f67e367c4059 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1150,6 +1150,10 @@ The following bits are defined in the flags field: fields contain a valid state. This bit will be set whenever KVM_CAP_EXCEPTION_PAYLOAD is enabled. +- KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the + triple_fault_pending field contains a valid state. This bit will + be set whenever KVM_CAP_TRIPLE_FAULT_EVENT is enabled. + ARM64: ^^^^^^ @@ -1245,6 +1249,10 @@ can be set in the flags field to signal that the exception_has_payload, exception_payload, and exception.pending fields contain a valid state and shall be written into the VCPU.
+ ARM64: ^^^^^^ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 032278f0ee6d..d6c62276e131 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1174,6 +1174,8 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; /* diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..24c807c8d5f7 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -325,6 +325,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +360,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8dfdef9e52f..422fbb0d7518 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4296,6 +4296,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4942,6 +4943,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4955,7 +4960,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5028,6 +5034,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -6029,6 +6044,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c4a32910b88a..ca799319acfd 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1158,6 +1158,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #define KVM_CAP_S390_PROTECTED_DUMP 217 +#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 
2f4073e08f4cc5a41e35d777c240aaadd0257051 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 24 May 2022 21:56:24 +0800 Subject: KVM: VMX: Enable Notify VM exit There are cases where malicious virtual machines can get the CPU stuck (because event windows don't open up), e.g., an infinite loop in microcode when nested #AC is hit (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered, which leaves the CPU unavailable to the host or other VMs. The VMM can enable notify VM exits, where a VM exit is generated if no event window occurs in VM non-root mode for a specified amount of time (the notify window). Feature enabling: - The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to enable this feature. The VMM can set the NOTIFY_WINDOW vmcs field to adjust the expected notify window. - Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space can query and enable this feature in per-VM scope. The argument is a 64bit value: bits 63:32 are used for the notify window, and bits 31:0 are for flags. Currently supported flags: - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify window provided. - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace once such exits happen. - It's safe to even set the notify window to zero since an internal hardware threshold is added to vmcs.notify_window. VM exit handling: - Introduce a vcpu stat notify_window_exits to record the count of notify VM exits and expose it through debugfs. - Notify VM exit can happen incident to delivery of a vector event. Allow it in KVM. - Exit to userspace unconditionally for handling when the VM_CONTEXT_INVALID bit is set. Nested handling: - Nested notify VM exits are not supported yet. Keep the same notify window control in vmcs02 as vmcs01, so that L1 can't escape the restriction of notify VM exits through launching an L2 VM. Notify VM exit is defined in the latest Intel Architecture Instruction Set Extensions Programming Reference, chapter 9.2. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Tao Xu Co-developed-by: Chenyi Qiang Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 7 ++++++ arch/x86/include/asm/vmx.h | 7 ++++++ arch/x86/include/asm/vmxfeatures.h | 1 + arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/vmx/capabilities.h | 6 +++++ arch/x86/kvm/vmx/nested.c | 8 +++++++ arch/x86/kvm/vmx/vmx.c | 40 +++++++++++++++++++++++++++++-- arch/x86/kvm/x86.c | 22 ++++++++++++++++- arch/x86/kvm/x86.h | 7 ++++++ include/uapi/linux/kvm.h | 11 +++++++++ 11 files changed, 158 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f67e367c4059..30e31a886422 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6557,6 +6557,26 @@ array field represents return values. The userspace should update the return values of SBI call before resuming the VCPU. For more details on RISC-V SBI spec refer, https://github.com/riscv/riscv-sbi-doc. +:: + + /* KVM_EXIT_NOTIFY */ + struct { + #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; + +Used on x86 systems. When the VM capability KVM_CAP_X86_NOTIFY_VMEXIT is +enabled, a VM exit is generated if no event window occurs in VM non-root mode +for a specified amount of time.
If KVM_X86_NOTIFY_VMEXIT_USER was set when +enabling the cap, KVM exits to userspace with the exit reason +KVM_EXIT_NOTIFY for further handling. The "flags" field contains more +detailed info. + +The valid value for 'flags' is: + + - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid + in VMCS. Resuming the target VM would lead to unpredictable results. + :: /* Fix the size of the union. */ @@ -7523,6 +7543,35 @@ if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as the maximum APIC ID. +7.33 KVM_CAP_X86_NOTIFY_VMEXIT +------------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] is the value of the notify window as well as some flags +:Returns: 0 on success, -EINVAL if args[0] contains invalid flags or notify + VM exit is unsupported. + +Bits 63:32 of args[0] are used for the notify window. +Bits 31:0 of args[0] are for flags. Valid bits are:: + + #define KVM_X86_NOTIFY_VMEXIT_ENABLED (1 << 0) + #define KVM_X86_NOTIFY_VMEXIT_USER (1 << 1) + +This capability allows userspace to configure the notify VM exit on/off +in per-VM scope during VM creation. Notify VM exit is disabled by default. +When userspace sets the KVM_X86_NOTIFY_VMEXIT_ENABLED bit in args[0], KVM will +enable this feature with the notify window provided, which will generate +a VM exit if no event window occurs in VM non-root mode for a specified amount +of time (the notify window). + +If KVM_X86_NOTIFY_VMEXIT_USER is set in args[0], KVM exits to userspace +for handling when notify VM exits happen. + +This capability aims to mitigate the threat that malicious VMs can +get the CPU stuck (because event windows don't open up), making the CPU +unavailable to the host or other VMs. + 8. Other capabilities. ====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4e00bca08cfa..6cf5d77d7896 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -65,6 +65,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -1178,6 +1181,9 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1325,6 +1331,7 @@ struct kvm_vcpu_stat { u64 directed_yield_attempted; u64 directed_yield_successful; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 89d2172787c5..c371ef695fcc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -75,6 +75,7 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) /* * Definitions of Tertiary Processor-Based VM-Execution Controls.
@@ -280,6 +281,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -564,6 +566,11 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) +/* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) + /* * VM-instruction error numbers */ diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 589608c157bf..c6a7eed03914 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -85,6 +85,7 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ /* Tertiary Processor-Based VM-Execution Controls, word 3 */ #define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f14c4bef97e0..2d3f13b18714 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -436,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5c5f4e3762f5..7d8cd0ebcc75 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2133,6 +2133,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2177,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. 
Only the * addresses are constant (for vmcs02), the counts can change based @@ -6112,6 +6117,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6d631941ac1a..2e00890d752a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2499,7 +2499,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -4417,6 +4418,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } @@ -4498,6 +4502,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -5784,6 +5791,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -5841,6 +5874,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -6214,7 +6248,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -8137,6 +8172,7 @@ static __init int hardware_setup(void) kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; kvm_caps.tsc_scaling_ratio_frac_bits = 48; kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53e5f2ad2422..a8014233fd57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -284,7 +284,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_ICOUNTER(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -4402,6 +4403,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -6125,6 +6129,22 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 359d0454ad28..501b884b8cc4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -21,6 +21,8 @@ struct kvm_caps { u64 default_tsc_scaling_ratio; /* bus lock detection supported? */ bool has_bus_lock_exit; + /* notify VM exit supported? */ + bool has_notify_vmexit; u64 supported_mce_cap; u64 supported_xcr0; @@ -364,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } +static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) +{ + return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; +} + enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. 
*/ KVM_HANDLING_IRQ = 1, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ca799319acfd..7569b4ec199c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -270,6 +270,7 @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_NOTIFY 36 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -496,6 +497,11 @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_NOTIFY */ + struct { +#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; /* Fix the size of the union. */ char padding[256]; }; @@ -1159,6 +1165,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 +#define KVM_CAP_X86_NOTIFY_VMEXIT 219 #ifdef KVM_CAP_IRQ_ROUTING @@ -2174,4 +2181,8 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_PROTECTED_DUMP */ #define KVM_S390_PV_CPU_COMMAND _IOWR(KVMIO, 0xd0, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */ +#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0) +#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 20e10881a043af63f2962a9e6bca64661225b383 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 8 Jun 2022 10:21:41 -0500 Subject: dma-buf: Add an API for exporting sync files (v14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern userspace APIs like Vulkan are built on an explicit synchronization model. This doesn't always play nicely with the implicit synchronization used in the kernel and assumed by X11 and Wayland. The client -> compositor half of the synchronization isn't too bad, at least on intel, because we can control whether or not i915 synchronizes on the buffer and whether or not it's considered written. The harder part is the compositor -> client synchronization when we get the buffer back from the compositor. We're required to be able to provide the client with a VkSemaphore and VkFence representing the point in time where the window system (compositor and/or display) finished using the buffer. With current APIs, it's very hard to do this in such a way that we don't get confused by the Vulkan driver's access of the buffer. In particular, once we tell the kernel that we're rendering to the buffer again, any CPU waits on the buffer or GPU dependencies will wait on some of the client rendering and not just the compositor. This new IOCTL solves this problem by allowing us to get a snapshot of the implicit synchronization state of a given dma-buf in the form of a sync file. It's effectively the same as a poll() or I915_GEM_WAIT only, instead of CPU waiting directly, it encapsulates the wait operation, at the current moment in time, in a sync_file so we can check/wait on it later. As long as the Vulkan driver does the sync_file export from the dma-buf before we re-introduce it for rendering, it will only contain fences from the compositor or display. This allows us to accurately turn it into a VkFence or VkSemaphore without any over-synchronization. By making this an ioctl on the dma-buf itself, it allows this new functionality to be used in an entirely driver-agnostic way without having access to a DRM fd. This makes it ideal for use in driver-generic code in Mesa or in a client such as a compositor where the DRM fd may be hard to reach.
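To illustrate the new ioctl from the userspace side, a minimal sketch (a hypothetical helper, not from this patch; error handling reduced to the ioctl result)::

    #include <linux/dma-buf.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Snapshot the fences a reader would have to wait on right now and
     * return them as a sync_file fd (roughly poll() with POLLIN, but
     * deferred). Returns the sync_file fd on success, -1 on error. */
    static int export_read_sync_file(int dmabuf_fd)
    {
            struct dma_buf_export_sync_file args;

            memset(&args, 0, sizeof(args));
            args.flags = DMA_BUF_SYNC_READ;   /* wait on writers only */
            if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &args) < 0)
                    return -1;
            return args.fd;
    }

A Vulkan driver would call something like this before re-introducing the buffer for rendering, then turn the returned fd into a VkFence or VkSemaphore.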
v2 (Jason Ekstrand): - Use a wrapper dma_fence_array of all fences including the new one when importing an exclusive fence. v3 (Jason Ekstrand): - Lock around setting shared fences as well as exclusive - Mark SIGNAL_SYNC_FILE as a read-write ioctl. - Initialize ret to 0 in dma_buf_wait_sync_file v4 (Jason Ekstrand): - Use the new dma_resv_get_singleton helper v5 (Jason Ekstrand): - Rename the IOCTLs to import/export rather than wait/signal - Drop the WRITE flag and always get/set the exclusive fence v6 (Jason Ekstrand): - Drop the sync_file import as it was all-around sketchy and not nearly as useful as export. - Re-introduce READ/WRITE flag support for export - Rework the commit message v7 (Jason Ekstrand): - Require at least one sync flag - Fix a refcounting bug: dma_resv_get_excl() doesn't take a reference - Use _rcu helpers since we're accessing the dma_resv read-only v8 (Jason Ekstrand): - Return -ENOMEM if the sync_file_create fails - Predicate support on IS_ENABLED(CONFIG_SYNC_FILE) v9 (Jason Ekstrand): - Add documentation for the new ioctl v10 (Jason Ekstrand): - Go back to dma_buf_sync_file as the ioctl struct name v11 (Daniel Vetter): - Go back to dma_buf_export_sync_file as the ioctl struct name - Better kerneldoc describing what the read/write flags do v12 (Christian König): - Document why we chose to make it an ioctl on dma-buf v13 (Jason Ekstrand): - Rebase on Christian König's fence rework v14 (Daniel Vetter & Christian König): - Use dma_resv_usage_rw to get the properly inverted usage to pass to dma_resv_get_singleton() - Clean up the sync_file and fd if copy_to_user() fails Signed-off-by: Jason Ekstrand Signed-off-by: Jason Ekstrand Signed-off-by: Jason Ekstrand Acked-by: Simon Ser Reviewed-by: Christian König Reviewed-by: Daniel Vetter Cc: Sumit Semwal Cc: Maarten Lankhorst Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20220608152142.14495-2-jason@jlekstrand.net --- drivers/dma-buf/dma-buf.c | 67 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/dma-buf.h | 35 +++++++++++++++++++++++ 2 files changed, 102 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index a2f9a1815e38..491797a5dfc8 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -192,6 +193,9 @@ static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence) * Note that this only signals the completion of the respective fences, i.e. the * DMA transfers are complete. Cache flushing and any other necessary * preparations before CPU access can begin still need to happen. + * + * As an alternative to poll(), the set of fences on a DMA buffer can be + * exported as a &sync_file using &dma_buf_sync_file_export.
*/ static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb) @@ -326,6 +330,64 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) return 0; } +#if IS_ENABLED(CONFIG_SYNC_FILE) +static long dma_buf_export_sync_file(struct dma_buf *dmabuf, + void __user *user_data) +{ + struct dma_buf_export_sync_file arg; + enum dma_resv_usage usage; + struct dma_fence *fence = NULL; + struct sync_file *sync_file; + int fd, ret; + + if (copy_from_user(&arg, user_data, sizeof(arg))) + return -EFAULT; + + if (arg.flags & ~DMA_BUF_SYNC_RW) + return -EINVAL; + + if ((arg.flags & DMA_BUF_SYNC_RW) == 0) + return -EINVAL; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + usage = dma_resv_usage_rw(arg.flags & DMA_BUF_SYNC_WRITE); + ret = dma_resv_get_singleton(dmabuf->resv, usage, &fence); + if (ret) + goto err_put_fd; + + if (!fence) + fence = dma_fence_get_stub(); + + sync_file = sync_file_create(fence); + + dma_fence_put(fence); + + if (!sync_file) { + ret = -ENOMEM; + goto err_put_fd; + } + + arg.fd = fd; + if (copy_to_user(user_data, &arg, sizeof(arg))) { + ret = -EFAULT; + goto err_put_file; + } + + fd_install(fd, sync_file->file); + + return 0; + +err_put_file: + fput(sync_file->file); +err_put_fd: + put_unused_fd(fd); + return ret; +} +#endif + static long dma_buf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -369,6 +431,11 @@ static long dma_buf_ioctl(struct file *file, case DMA_BUF_SET_NAME_B: return dma_buf_set_name(dmabuf, (const char __user *)arg); +#if IS_ENABLED(CONFIG_SYNC_FILE) + case DMA_BUF_IOCTL_EXPORT_SYNC_FILE: + return dma_buf_export_sync_file(dmabuf, (void __user *)arg); +#endif + default: return -ENOTTY; } diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index 8e4a2ca0bcbf..46f1e3e98b02 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -85,6 +85,40 @@ struct dma_buf_sync { #define DMA_BUF_NAME_LEN 32 +/** + * struct dma_buf_export_sync_file - Get a sync_file from a dma-buf + * + * Userspace can perform a DMA_BUF_IOCTL_EXPORT_SYNC_FILE to retrieve the + * current set of fences on a dma-buf file descriptor as a sync_file. CPU + * waits via poll() or other driver-specific mechanisms typically wait on + * whatever fences are on the dma-buf at the time the wait begins. This + * is similar except that it takes a snapshot of the current fences on the + * dma-buf for waiting later instead of waiting immediately. This is + * useful for modern graphics APIs such as Vulkan which assume an explicit + * synchronization model but still need to inter-operate with dma-buf. + */ +struct dma_buf_export_sync_file { + /** + * @flags: Read/write flags + * + * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. + * + * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, + * the returned sync file waits on any writers of the dma-buf to + * complete. Waiting on the returned sync file is equivalent to + * poll() with POLLIN. + * + * If DMA_BUF_SYNC_WRITE is set, the returned sync file waits on + * any users of the dma-buf (read or write) to complete. Waiting + * on the returned sync file is equivalent to poll() with POLLOUT. + * If both DMA_BUF_SYNC_WRITE and DMA_BUF_SYNC_READ are set, this + * is equivalent to just DMA_BUF_SYNC_WRITE. 
+ */ + __u32 flags; + /** @fd: Returned sync file descriptor */ + __s32 fd; +}; + #define DMA_BUF_BASE 'b' #define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) @@ -94,5 +128,6 @@ struct dma_buf_sync { #define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) #define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, u32) #define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, u64) +#define DMA_BUF_IOCTL_EXPORT_SYNC_FILE _IOWR(DMA_BUF_BASE, 2, struct dma_buf_export_sync_file) #endif -- cgit v1.2.3 From 594740497e998d30477ab26093bfb81c28cd3ff1 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 8 Jun 2022 10:21:42 -0500 Subject: dma-buf: Add an API for importing sync files (v10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch is analogous to the previous sync file export patch in that it allows you to import a sync_file into a dma-buf. Unlike the previous patch, however, this does add genuinely new functionality to dma-buf. Without this, the only way to attach a sync_file to a dma-buf is to submit a batch to your driver of choice which waits on the sync_file and claims to write to the dma-buf. Even if said batch is a no-op, a submit is typically way more overhead than just attaching a fence. A submit may also imply extra synchronization with other work because it happens on a hardware queue. In the Vulkan world, this is useful for dealing with the out-fence from vkQueuePresent. Current Linux window-systems (X11, Wayland, etc.) all rely on dma-buf implicit sync. Since Vulkan is an explicit sync API, we get a set of fences (VkSemaphores) in vkQueuePresent and have to stash those as an exclusive (write) fence on the dma-buf. We handle it in Mesa today with the above-mentioned dummy submit trick. This ioctl would allow us to set it directly without the dummy submit. This may also open up possibilities for GPU drivers to move away from implicit sync for their kernel driver uAPI and instead provide sync files and rely on dma-buf import/export for communicating with other implicit sync clients. We make the explicit choice here to only allow setting RW fences which translates to an exclusive fence on the dma_resv. There's no use for read-only fences for communicating with other implicit sync userspace and any such attempts are likely to be racy at best. When we go to insert the RW fence, the actual fence we set as the new exclusive fence is a combination of the sync_file provided by the user and all the other fences on the dma_resv. This ensures that the newly added exclusive fence will never signal before the old one would have and ensures that we don't break any dma_resv contracts. We require userspace to specify RW in the flags for symmetry with the export ioctl and in case we ever want to support read fences in the future. There is one downside here that's worth documenting: If two clients writing to the same dma-buf using this API race with each other, their actions on the dma-buf may happen in parallel or in an undefined order. Both with and without this API, the pattern is the same: Collect all the fences on the dma-buf, submit work which depends on said fences, and then set a new exclusive (write) fence on the dma-buf which depends on said work. The difference is that, when it's all handled by the GPU driver's submit ioctl, the three operations happen atomically under the dma_resv lock. If two userspace submits race, one will happen before the other. You aren't guaranteed which but you are guaranteed that they're strictly ordered.
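For illustration, a minimal userspace sketch of the import side (a hypothetical helper; it assumes a sync_file fd obtained elsewhere, e.g. from the vkQueuePresent machinery, and elides error handling)::

    #include <linux/dma-buf.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Attach a sync_file to the dma-buf as a write (exclusive) fence so
     * implicit-sync consumers such as a compositor wait on it before
     * touching the buffer. Returns 0 on success, -1 on error. */
    static int import_write_sync_file(int dmabuf_fd, int sync_file_fd)
    {
            struct dma_buf_import_sync_file args;

            memset(&args, 0, sizeof(args));
            args.flags = DMA_BUF_SYNC_WRITE;
            args.fd = sync_file_fd;
            return ioctl(dmabuf_fd, DMA_BUF_IOCTL_IMPORT_SYNC_FILE, &args);
    }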
If userspace manages the fences itself, then these three operations happen separately and the two render operations may happen genuinely in parallel or get interleaved. However, this is a case of userspace racing with itself. As long as we ensure userspace can't back the kernel into a corner, it should be fine. v2 (Jason Ekstrand): - Use a wrapper dma_fence_array of all fences including the new one when importing an exclusive fence. v3 (Jason Ekstrand): - Lock around setting shared fences as well as exclusive - Mark SIGNAL_SYNC_FILE as a read-write ioctl. - Initialize ret to 0 in dma_buf_wait_sync_file v4 (Jason Ekstrand): - Use the new dma_resv_get_singleton helper v5 (Jason Ekstrand): - Rename the IOCTLs to import/export rather than wait/signal - Drop the WRITE flag and always get/set the exclusive fence v6 (Jason Ekstrand): - Split import and export into separate patches - New commit message v7 (Daniel Vetter): - Fix the uapi header to use the right struct in the ioctl - Use a separate dma_buf_import_sync_file struct - Add kerneldoc for dma_buf_import_sync_file v8 (Jason Ekstrand): - Rebase on Christian König's fence rework v9 (Daniel Vetter): - Fix -EINVAL checks for the flags parameter - Add documentation about read/write fences - Add documentation about the expected usage of import/export and specifically call out the possible userspace race. v10 (Simon Ser): - Fix a typo in the docs Signed-off-by: Jason Ekstrand Signed-off-by: Jason Ekstrand Signed-off-by: Jason Ekstrand Reviewed-by: Christian König Reviewed-by: Daniel Vetter Cc: Sumit Semwal Cc: Maarten Lankhorst Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20220608152142.14495-3-jason@jlekstrand.net --- drivers/dma-buf/dma-buf.c | 39 +++++++++++++++++++++++++++++++++++ include/uapi/linux/dma-buf.h | 49 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 491797a5dfc8..5e1b0534b3ce 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -386,6 +386,43 @@ err_put_fd: put_unused_fd(fd); return ret; } + +static long dma_buf_import_sync_file(struct dma_buf *dmabuf, + const void __user *user_data) +{ + struct dma_buf_import_sync_file arg; + struct dma_fence *fence; + enum dma_resv_usage usage; + int ret = 0; + + if (copy_from_user(&arg, user_data, sizeof(arg))) + return -EFAULT; + + if (arg.flags & ~DMA_BUF_SYNC_RW) + return -EINVAL; + + if ((arg.flags & DMA_BUF_SYNC_RW) == 0) + return -EINVAL; + + fence = sync_file_get_fence(arg.fd); + if (!fence) + return -EINVAL; + + usage = (arg.flags & DMA_BUF_SYNC_WRITE) ? 
DMA_RESV_USAGE_WRITE : + DMA_RESV_USAGE_READ; + + dma_resv_lock(dmabuf->resv, NULL); + + ret = dma_resv_reserve_fences(dmabuf->resv, 1); + if (!ret) + dma_resv_add_fence(dmabuf->resv, fence, usage); + + dma_resv_unlock(dmabuf->resv); + + dma_fence_put(fence); + + return ret; +} #endif static long dma_buf_ioctl(struct file *file, @@ -434,6 +471,8 @@ static long dma_buf_ioctl(struct file *file, #if IS_ENABLED(CONFIG_SYNC_FILE) case DMA_BUF_IOCTL_EXPORT_SYNC_FILE: return dma_buf_export_sync_file(dmabuf, (void __user *)arg); + case DMA_BUF_IOCTL_IMPORT_SYNC_FILE: + return dma_buf_import_sync_file(dmabuf, (const void __user *)arg); #endif default: diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index 46f1e3e98b02..30fb8834aa3c 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -96,6 +96,24 @@ struct dma_buf_sync { * dma-buf for waiting later instead of waiting immediately. This is * useful for modern graphics APIs such as Vulkan which assume an explicit * synchronization model but still need to inter-operate with dma-buf. + * + * The intended usage pattern is the following: + * + * 1. Export a sync_file with flags corresponding to the expected GPU usage + * via DMA_BUF_IOCTL_EXPORT_SYNC_FILE. + * + * 2. Submit rendering work which uses the dma-buf. The work should wait on + * the exported sync file before rendering and produce another sync_file + * when complete. + * + * 3. Import the rendering-complete sync_file into the dma-buf with flags + * corresponding to the GPU usage via DMA_BUF_IOCTL_IMPORT_SYNC_FILE. + * + * Unlike doing implicit synchronization via a GPU kernel driver's exec ioctl, + * the above is not a single atomic operation. If userspace wants to ensure + * ordering via these fences, it is the responsibility of userspace to use + * locks or other mechanisms to ensure that no other context adds fences or + * submits work between steps 1 and 3 above. */ struct dma_buf_export_sync_file { /** @@ -119,6 +137,36 @@ struct dma_buf_export_sync_file { __s32 fd; }; +/** + * struct dma_buf_import_sync_file - Insert a sync_file into a dma-buf + * + * Userspace can perform a DMA_BUF_IOCTL_IMPORT_SYNC_FILE to insert a + * sync_file into a dma-buf for the purposes of implicit synchronization + * with other dma-buf consumers. This allows clients using explicitly + * synchronized APIs such as Vulkan to inter-op with dma-buf consumers + * which expect implicit synchronization such as OpenGL or most media + * drivers/video. +struct dma_buf_import_sync_file { + /** + * @flags: Read/write flags + * + * Must be DMA_BUF_SYNC_READ, DMA_BUF_SYNC_WRITE, or both. + * + * If DMA_BUF_SYNC_READ is set and DMA_BUF_SYNC_WRITE is not set, + * this inserts the sync_file as a read-only fence. Any subsequent + * implicitly synchronized writes to this dma-buf will wait on this + * fence but reads will not. + * + * If DMA_BUF_SYNC_WRITE is set, this inserts the sync_file as a + * write fence. All subsequent implicitly synchronized access to + * this dma-buf will wait on this fence.
+ */ + __u32 flags; + /** @fd: Sync file descriptor */ + __s32 fd; +}; + #define DMA_BUF_BASE 'b' #define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) @@ -129,5 +177,6 @@ struct dma_buf_export_sync_file { #define DMA_BUF_SET_NAME_A _IOW(DMA_BUF_BASE, 1, u32) #define DMA_BUF_SET_NAME_B _IOW(DMA_BUF_BASE, 1, u64) #define DMA_BUF_IOCTL_EXPORT_SYNC_FILE _IOWR(DMA_BUF_BASE, 2, struct dma_buf_export_sync_file) +#define DMA_BUF_IOCTL_IMPORT_SYNC_FILE _IOW(DMA_BUF_BASE, 3, struct dma_buf_import_sync_file) #endif -- cgit v1.2.3 From b489a6e5871690735752f8875f411e4d0cd8e5df Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 8 Jun 2022 18:34:25 +0300 Subject: tls: Rename TLS_INFO_ZC_SENDFILE to TLS_INFO_ZC_RO_TX To embrace possible future optimizations of TLS, rename zerocopy sendfile definitions to more generic ones: * setsockopt: TLS_TX_ZEROCOPY_SENDFILE -> TLS_TX_ZEROCOPY_RO * sock_diag: TLS_INFO_ZC_SENDFILE -> TLS_INFO_ZC_RO_TX RO stands for readonly and emphasizes that the application shouldn't modify the data being transmitted with zerocopy to avoid potential disconnection. Fixes: c1318b39c7d3 ("tls: Add opt-in zerocopy mode of sendfile()") Signed-off-by: Maxim Mikityanskiy Link: https://lore.kernel.org/r/20220608153425.3151146-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tls.h | 4 ++-- net/tls/tls_main.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index ac39328eabe7..bb8f80812b0b 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -39,7 +39,7 @@ /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ #define TLS_RX 2 /* Set receive parameters */ -#define TLS_TX_ZEROCOPY_SENDFILE 3 /* transmit zerocopy sendfile */ +#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -161,7 +161,7 @@ enum { TLS_INFO_CIPHER, TLS_INFO_TXCONF, TLS_INFO_RXCONF, - TLS_INFO_ZC_SENDFILE, + TLS_INFO_ZC_RO_TX, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b91ddc110786..da176411c1b5 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -544,7 +544,7 @@ static int do_tls_getsockopt(struct sock *sk, int optname, rc = do_tls_getsockopt_conf(sk, optval, optlen, optname == TLS_TX); break; - case TLS_TX_ZEROCOPY_SENDFILE: + case TLS_TX_ZEROCOPY_RO: rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); break; default: @@ -731,7 +731,7 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, optname == TLS_TX); release_sock(sk); break; - case TLS_TX_ZEROCOPY_SENDFILE: + case TLS_TX_ZEROCOPY_RO: lock_sock(sk); rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); release_sock(sk); @@ -970,7 +970,7 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb) goto nla_failure; if (ctx->tx_conf == TLS_HW && ctx->zerocopy_sendfile) { - err = nla_put_flag(skb, TLS_INFO_ZC_SENDFILE); + err = nla_put_flag(skb, TLS_INFO_ZC_RO_TX); if (err) goto nla_failure; } @@ -994,7 +994,7 @@ static size_t tls_get_info_size(const struct sock *sk) nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */ nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ - nla_total_size(0) + /* TLS_INFO_ZC_SENDFILE */ + nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ 0; return size; -- cgit v1.2.3 From 
6b2a51ff03bf0c54cbc699ee85a9a49eb203ebfc Mon Sep 17 00:00:00 2001 From: Nathan Huckleberry Date: Fri, 20 May 2022 18:15:01 +0000 Subject: fscrypt: Add HCTR2 support for filename encryption HCTR2 is a tweakable, length-preserving encryption mode that is intended for use on CPUs with dedicated crypto instructions. HCTR2 has the property that a bitflip in the plaintext changes the entire ciphertext. This property fixes a known weakness with filename encryption: when two filenames in the same directory share a prefix of >= 16 bytes, with AES-CTS-CBC their encrypted filenames share a common substring, leaking information. HCTR2 does not have this problem. More information on HCTR2 can be found here: "Length-preserving encryption with HCTR2": https://eprint.iacr.org/2021/1441.pdf Signed-off-by: Nathan Huckleberry Reviewed-by: Ard Biesheuvel Acked-by: Eric Biggers Signed-off-by: Herbert Xu --- Documentation/filesystems/fscrypt.rst | 22 +++++++++++++++++----- fs/crypto/fscrypt_private.h | 2 +- fs/crypto/keysetup.c | 7 +++++++ fs/crypto/policy.c | 14 +++++++++++--- include/uapi/linux/fscrypt.h | 3 ++- 5 files changed, 38 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 2e9aaa295125..5ba5817c17c2 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -337,6 +337,7 @@ Currently, the following pairs of encryption modes are supported: - AES-256-XTS for contents and AES-256-CTS-CBC for filenames - AES-128-CBC for contents and AES-128-CTS-CBC for filenames - Adiantum for both contents and filenames +- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only) If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair. @@ -357,6 +358,17 @@ To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast implementations of ChaCha and NHPoly1305 should be enabled, e.g. CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM. +AES-256-HCTR2 is another true wide-block encryption mode that is intended for +use on CPUs with dedicated crypto instructions. AES-256-HCTR2 has the property +that a bitflip in the plaintext changes the entire ciphertext. This property +makes it desirable for filename encryption since initialization vectors are +reused within a directory. For more details on AES-256-HCTR2, see the paper +"Length-preserving encryption with HCTR2" +(https://eprint.iacr.org/2021/1441.pdf). To use AES-256-HCTR2, +CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and +POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and +CRYPTO_AES_ARM64_CE_BLK for ARM64. + New encryption modes can be added relatively easily, without changes to individual filesystems. However, authenticated encryption (AE) modes are not currently supported because of the difficulty of dealing @@ -404,11 +416,11 @@ alternatively has the file's nonce (for `DIRECT_KEY policies`_) or inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs. Thus, IV reuse is limited to within a single directory. -With CTS-CBC, the IV reuse means that when the plaintext filenames -share a common prefix at least as long as the cipher block size (16 -bytes for AES), the corresponding encrypted filenames will also share -a common prefix. This is undesirable. Adiantum does not have this -weakness, as it is a wide-block encryption mode. 
+With CTS-CBC, the IV reuse means that when the plaintext filenames share a +common prefix at least as long as the cipher block size (16 bytes for AES), the +corresponding encrypted filenames will also share a common prefix. This is +undesirable. Adiantum and HCTR2 do not have this weakness, as they are +wide-block encryption modes. All supported filenames encryption modes accept any plaintext length >= 16 bytes; cipher block alignment is not required. However, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6b4c8094cc7b..f5be777d8279 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -31,7 +31,7 @@ #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ -#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM +#define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index c35711896bd4..fbc71abdabe3 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -53,6 +53,13 @@ struct fscrypt_mode fscrypt_modes[] = { .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, + [FSCRYPT_MODE_AES_256_HCTR2] = { + .friendly_name = "AES-256-HCTR2", + .cipher_str = "hctr2(aes)", + .keysize = 32, + .security_strength = 32, + .ivsize = 32, + }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 5f858cee1e3b..8a054e6d1e68 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,7 +61,7 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } -static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) +static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_CTS) @@ -78,6 +78,14 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) return false; } +static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) +{ + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); +} + static bool supported_direct_key_modes(const struct inode *inode, u32 contents_mode, u32 filenames_mode) { @@ -151,7 +159,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, const struct inode *inode) { - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -187,7 +195,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, { int count = 0; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 9f4428be3e36..a756b29afcc2 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -27,7 +27,8 @@ #define FSCRYPT_MODE_AES_128_CBC 5 #define FSCRYPT_MODE_AES_128_CTS 6 #define FSCRYPT_MODE_ADIANTUM 9 -/* If adding a mode 
number > 9, update FSCRYPT_MODE_MAX in fscrypt_private.h */ +#define FSCRYPT_MODE_AES_256_HCTR2 10 +/* If adding a mode number > 10, update FSCRYPT_MODE_MAX in fscrypt_private.h */ /* * Legacy policy version; ad-hoc KDF and no key verification. -- cgit v1.2.3 From 924b290655c0f17ac84e752addfc9bc3ec361069 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 May 2022 13:47:40 -0700 Subject: xfrm: convert alg_key to flexible array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iproute2 build generates a warning when built with gcc-12. This is because alg_key in the xfrm.h uAPI has a zero-size array element instead of a flexible array member. CC xfrm_state.o In function ‘xfrm_algo_parse’, inlined from ‘xfrm_state_modify.constprop’ at xfrm_state.c:573:5: xfrm_state.c:162:32: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 162 | buf[j] = val; | ~~~~~~~^~~~~ This patch converts the alg_key fields into flexible array members. There are other zero size arrays here that should be converted as well. This patch is RFC only since it is only compile tested and passes trivial iproute2 tests. Signed-off-by: Stephen Hemminger Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 65e13a099b1a..3ed61df9cc91 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -102,21 +102,21 @@ struct xfrm_replay_state_esn { struct xfrm_algo { char alg_name[64]; unsigned int alg_key_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_algo_auth { char alg_name[64]; unsigned int alg_key_len; /* in bits */ unsigned int alg_trunc_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_algo_aead { char alg_name[64]; unsigned int alg_key_len; /* in bits */ unsigned int alg_icv_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_stats { -- cgit v1.2.3 From 20646f5b1e798bcc20044ae90ac3702f177bf254 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2022 17:23:45 +0200 Subject: netfilter: xtables: Bring SPDX identifier back Commit e2be04c7f995 ("License cleanup: add SPDX license identifier to uapi header files with a license") added the correct SPDX identifier to include/uapi/linux/netfilter/xt_IDLETIMER.h. A subsequent commit removed it for no reason and reintroduced the UAPI license incorrectness as the file is now missing the UAPI exception again. Add it back and remove the GPLv2 boilerplate while at it. Fixes: 68983a354a65 ("netfilter: xtables: Add snapshot of hardidletimer target") Cc: Manoj Basapathi Cc: Subash Abhinov Kasiviswanathan Cc: Pablo Neira Ayuso Cc: netfilter-devel@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/netfilter/xt_IDLETIMER.h | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h index 49ddcdc61c09..7bfb31a66fc9 100644 --- a/include/uapi/linux/netfilter/xt_IDLETIMER.h +++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h @@ -1,6 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* - * linux/include/linux/netfilter/xt_IDLETIMER.h - * * Header file for Xtables timer target module.
* * Copyright (C) 2004, 2010 Nokia Corporation @@ -10,20 +9,6 @@ * by Luciano Coelho * * Contact: Luciano Coelho - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA - * 02110-1301 USA */ #ifndef _XT_IDLETIMER_H -- cgit v1.2.3 From 17472bc2c3d00e4ed72c3c7b9af0697edea02fff Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:27 +0200 Subject: wifi: nl80211: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20220521111145.81697-77-Julia.Lawall@inria.fr Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d9490e3062a7..98f905f16411 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5874,7 +5874,7 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_INACTIVITY_TIMER: This driver takes care of freeing up * the connected inactive stations in AP mode. * @NL80211_FEATURE_CELL_BASE_REG_HINTS: This driver has been tested - * to work properly to suppport receiving regulatory hints from + * to work properly to support receiving regulatory hints from * cellular base stations. * @NL80211_FEATURE_P2P_DEVICE_NEEDS_CHANNEL: (no longer available, only * here to reserve the value for API/ABI compatibility) -- cgit v1.2.3 From d884b6498d2f022098502e106d5a45ab635f2e9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 14 Jun 2022 17:51:18 +0100 Subject: io_uring: remove IORING_CLOSE_FD_AND_FILE_SLOT This partially reverts a7c41b4687f5902af70cd559806990930c8a307b. Even though IORING_CLOSE_FD_AND_FILE_SLOT might save cycles for some users, it tries to do two things at a time and it's not clear how to handle errors and what to return in a single result field when one part fails and another completes well. Kill it for now.
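With the flag gone, the two closes stay separate operations; a rough liburing-style sketch of what remains (assumes liburing's stock prep helpers; ring setup and error checks elided)::

    #include <liburing.h>

    /* Queue two independent SQEs: one closes a regular fd, the other
     * clears a fixed file slot. Each gets its own CQE, so each failure
     * is reported in its own result field. */
    static void queue_closes(struct io_uring *ring, int fd, unsigned slot)
    {
            struct io_uring_sqe *sqe;

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_close(sqe, fd);

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_close_direct(sqe, slot);

            io_uring_submit(ring);
    }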
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/837c745019b3795941eee4fcfd7de697886d645b.1655224415.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++--------- include/uapi/linux/io_uring.h | 6 ------ 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1b95c6750a81..1b0b6099e717 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -576,7 +576,6 @@ struct io_close { struct file *file; int fd; u32 file_slot; - u32 flags; }; struct io_timeout_data { @@ -5966,18 +5965,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->off || sqe->addr || sqe->len || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); req->close.file_slot = READ_ONCE(sqe->file_index); - req->close.flags = READ_ONCE(sqe->close_flags); - if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT) - return -EINVAL; - if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) && - req->close.file_slot && req->close.fd) + if (req->close.file_slot && req->close.fd) return -EINVAL; return 0; @@ -5993,8 +5988,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) if (req->close.file_slot) { ret = io_close_fixed(req, issue_flags); - if (ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT)) - goto err; + goto err; } spin_lock(&files->file_lock); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 776e0278f9dd..53e7dae92e42 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -47,7 +47,6 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; - __u32 close_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -259,11 +258,6 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) -/* - * close flags, store in sqe->close_flags - */ -#define IORING_CLOSE_FD_AND_FILE_SLOT (1U << 0) - /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 9731dd4cadc53251ef80b3655c8d841fed52fa3d Mon Sep 17 00:00:00 2001 From: Daniel Phillips Date: Mon, 30 May 2022 11:21:22 -0400 Subject: drm/amdkfd: Add available memory ioctl Add a new KFD ioctl to return the largest possible memory size that can be allocated as a buffer object using kfd_ioctl_alloc_memory_of_gpu. It attempts to use exactly the same accept/reject criteria as that function so that allocating a new buffer object of the size returned by this new ioctl is guaranteed to succeed, barring races with other allocating tasks. 
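A rough sketch of a userspace caller (field and ioctl names taken from the kfd_ioctl.h hunk below; the exact AMDKFD_IOC_AVAILABLE_MEMORY request definition is assumed to come from that header)::

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kfd_ioctl.h>

    /* Ask KFD for the largest BO size that should currently be
     * allocatable on the given GPU. Returns 0 on success. */
    static int query_available_memory(__u32 gpu_id, __u64 *available)
    {
            struct kfd_ioctl_get_available_memory_args args;
            int fd, ret;

            fd = open("/dev/kfd", O_RDWR | O_CLOEXEC);
            if (fd < 0)
                    return -1;

            memset(&args, 0, sizeof(args));
            args.gpu_id = gpu_id;
            ret = ioctl(fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args);
            if (!ret)
                    *available = args.available;
            close(fd);
            return ret;
    }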
This IOCTL will be used by libhsakmt: https://www.mail-archive.com/amd-gfx@lists.freedesktop.org/msg75743.html Signed-off-by: Daniel Phillips Signed-off-by: David Yat Sin Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 38 +++++++++++++++++++++--- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 34 +++++++++++++++++++++ include/uapi/linux/kfd_ioctl.h | 14 +++++++-- 4 files changed, 81 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index ffb2b7d9b9a5..648c031942e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -268,6 +268,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev, void *drm_priv); uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv); +size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev); int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( struct amdgpu_device *adev, uint64_t va, uint64_t size, void *drm_priv, struct kgd_mem **mem, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index f386b0d256d2..8805bd1eed37 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -38,6 +38,12 @@ */ #define AMDGPU_USERPTR_RESTORE_DELAY_MS 1 +/* + * Align VRAM allocations to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB + * BO chunk + */ +#define VRAM_ALLOCATION_ALIGN (1 << 21) + /* Impose limit on how much memory KFD can use */ static struct { uint64_t max_system_mem_limit; @@ -108,7 +114,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size) * compromise that should work in most cases without reserving too * much memory for page tables unnecessarily (factor 16K, >> 14). */ -#define ESTIMATE_PT_SIZE(mem_size) ((mem_size) >> 14) +#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), AMDGPU_VM_RESERVED_VRAM) static size_t amdgpu_amdkfd_acc_size(uint64_t size) { @@ -148,7 +154,13 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { system_mem_needed = acc_size; ttm_mem_needed = acc_size; - vram_needed = size; + + /* + * Conservatively round up the allocation requirement to 2 MB + * to avoid fragmentation caused by 4K allocations in the tail + * 2M BO chunk. 
+ */ + vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN); } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { system_mem_needed = acc_size + size; ttm_mem_needed = acc_size; @@ -173,7 +185,9 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > kfd_mem_limit.max_ttm_mem_limit) || (adev->kfd.vram_used + vram_needed > - adev->gmc.real_vram_size - reserved_for_pt)) { + adev->gmc.real_vram_size - + atomic64_read(&adev->vram_pin_size) - + reserved_for_pt)) { ret = -ENOMEM; goto release; } @@ -205,7 +219,7 @@ static void unreserve_mem_limit(struct amdgpu_device *adev, } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { kfd_mem_limit.system_mem_used -= acc_size; kfd_mem_limit.ttm_mem_used -= acc_size; - adev->kfd.vram_used -= size; + adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN); } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { kfd_mem_limit.system_mem_used -= (acc_size + size); kfd_mem_limit.ttm_mem_used -= acc_size; @@ -1633,6 +1647,22 @@ out_unlock: return ret; } +size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev) +{ + uint64_t reserved_for_pt = + ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); + size_t available; + + spin_lock(&kfd_mem_limit.mem_limit_lock); + available = adev->gmc.real_vram_size + - adev->kfd.vram_used + - atomic64_read(&adev->vram_pin_size) + - reserved_for_pt; + spin_unlock(&kfd_mem_limit.mem_limit_lock); + + return ALIGN_DOWN(available, VRAM_ALLOCATION_ALIGN); +} + int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( struct amdgpu_device *adev, uint64_t va, uint64_t size, void *drm_priv, struct kgd_mem **mem, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 1c7016958d6d..3bcf9bf29acb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -65,6 +65,25 @@ static int kfd_char_dev_major = -1; static struct class *kfd_class; struct device *kfd_device; +static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id) +{ + struct kfd_process_device *pdd; + + mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, gpu_id); + + if (pdd) + return pdd; + + mutex_unlock(&p->mutex); + return NULL; +} + +static inline void kfd_unlock_pdd(struct kfd_process_device *pdd) +{ + mutex_unlock(&pdd->process->mutex); +} + int kfd_chardev_init(void) { int err = 0; @@ -958,6 +977,19 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev) return false; } +static int kfd_ioctl_get_available_memory(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_available_memory_args *args = data; + struct kfd_process_device *pdd = kfd_lock_pdd_by_id(p, args->gpu_id); + + if (!pdd) + return -EINVAL; + args->available = amdgpu_amdkfd_get_available_memory(pdd->dev->adev); + kfd_unlock_pdd(pdd); + return 0; +} + static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, struct kfd_process *p, void *data) { @@ -2648,6 +2680,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP, kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY, + kfd_ioctl_get_available_memory, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 42975e940758..231eb010b823 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -34,9 +34,10 @@ * - 1.6 - Query 
clear flags in SVM get_attr API * - 1.7 - Checkpoint Restore (CRIU) API * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs + * - 1.9 - Add available memory ioctl */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 8 +#define KFD_IOCTL_MINOR_VERSION 9 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -100,6 +101,12 @@ struct kfd_ioctl_get_queue_wave_state_args { __u32 pad; }; +struct kfd_ioctl_get_available_memory_args { + __u64 available; /* from KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; +}; + /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ #define KFD_IOC_CACHE_POLICY_COHERENT 0 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 @@ -826,7 +833,10 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_CRIU_OP \ AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args) +#define AMDKFD_IOC_AVAILABLE_MEMORY \ + AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x23 +#define AMDKFD_COMMAND_END 0x24 #endif -- cgit v1.2.3 From d30dfd490f7dc4cb6a7c11a647bd1ff7a22139e7 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 8 Jun 2022 15:35:39 -0700 Subject: include/uapi/linux/swab.h: move explicit cast outside ternary A cast inside __builtin_constant_p doesn't do anything since it should evaluate as constant at compile time irrespective of this cast. Instead, I moved this cast outside the ternary to ensure the return type is as expected. Additionally, if __HAVE_BUILTIN_BSWAP16__ was not defined then __swab16 is actually returning an `int` not a `u16` due to integer promotion. As Al Viro notes: You *can't* get smaller-than-int out of ? :, same as you can't get it out of addition, etc. This also fixes some clang -Wformat warnings involving default argument promotion. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Link: https://lkml.kernel.org/r/20220608223539.470472-1-justinstitt@google.com Signed-off-by: Justin Stitt Suggested-by: Al Viro Suggested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Suggested-by: Nick Desaulniers Signed-off-by: Andrew Morton --- include/uapi/linux/swab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index 7272f85d6d6a..0723a9cce747 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -102,7 +102,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x)) #else #define __swab16(x) \ - (__builtin_constant_p((__u16)(x)) ? \ + (__u16)(__builtin_constant_p(x) ? \ ___constant_swab16(x) : \ __fswab16(x)) #endif @@ -115,7 +115,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x)) #else #define __swab32(x) \ - (__builtin_constant_p((__u32)(x)) ? \ + (__u32)(__builtin_constant_p(x) ? \ ___constant_swab32(x) : \ __fswab32(x)) #endif @@ -128,7 +128,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) #define __swab64(x) (__u64)__builtin_bswap64((__u64)(x)) #else #define __swab64(x) \ - (__builtin_constant_p((__u64)(x)) ? \ + (__u64)(__builtin_constant_p(x) ? 
\ ___constant_swab64(x) : \ __fswab64(x)) #endif -- cgit v1.2.3 From ac80287a6af9fc3f3d189d6d1f523889a0a9e1bc Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:42 +0300 Subject: bpf: Fix documentation of th_len in bpf_tcp_{gen,check}_syncookie bpf_tcp_gen_syncookie expects the full length of the TCP header (with all options), and bpf_tcp_check_syncookie accepts lengths bigger than sizeof(struct tcphdr). Fix the documentation that says these lengths should be exactly sizeof(struct tcphdr). While at it, fix a typo in the name of struct ipv6hdr. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-2-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 ++++++---- tools/include/uapi/linux/bpf.h | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..f545e39df72a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3597,10 +3597,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains **sizeof**\ (**struct tcphdr**). + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -3783,10 +3784,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains the length of the TCP header. + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4009dbdf62d..f545e39df72a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3597,10 +3597,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains **sizeof**\ (**struct tcphdr**). + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -3783,10 +3784,11 @@ union bpf_attr { * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains the length of the TCP header. + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). 
* Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, -- cgit v1.2.3 From 33bf9885040c399cf6a95bd33216644126728e14 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 15 Jun 2022 16:48:44 +0300 Subject: bpf: Add helpers to issue and check SYN cookies in XDP The new helpers bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6} allow an XDP program to generate SYN cookies in response to TCP SYN packets and to check those cookies upon receiving the first ACK packet (the final packet of the TCP handshake). Unlike bpf_tcp_{gen,check}_syncookie, these new helpers don't need a listening socket on the local machine, which makes it possible to use them together with synproxy to accelerate SYN cookie generation. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220615134847.3753567-4-maximmi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 1 + include/uapi/linux/bpf.h | 78 +++++++++++++++++++++++++++ net/core/filter.c | 118 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 3 +- scripts/bpf_doc.py | 4 ++ tools/include/uapi/linux/bpf.h | 78 +++++++++++++++++++++++++++ 6 files changed, 281 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e99f5c61f84..9a1efe23fab7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -432,6 +432,7 @@ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f545e39df72a..e81362891596 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5251,6 +5251,80 @@ union bpf_attr { * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)).
+ * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5457,6 +5531,10 @@ union bpf_attr { FN(dynptr_read), \ FN(dynptr_write), \ FN(dynptr_data), \ + FN(tcp_raw_gen_syncookie_ipv4), \ + FN(tcp_raw_gen_syncookie_ipv6), \ + FN(tcp_raw_check_syncookie_ipv4), \ + FN(tcp_raw_check_syncookie_ipv6), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index 5af58eb48587..b62d4126a561 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7444,6 +7444,114 @@ static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_SYN_COOKIES +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th, u32, th_len) +{ + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; + cookie = __cookie_v4_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th, u32, th_len) +{ +#if IS_BUILTIN(CONFIG_IPV6) + const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr); + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; + cookie = __cookie_v6_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size 
= sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th) +{ + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v4_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th) +{ +#if IS_BUILTIN(CONFIG_IPV6) + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v6_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; +#endif /* CONFIG_SYN_COOKIES */ + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7856,6 +7964,16 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif #endif default: return bpf_sk_base_func_proto(func_id); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2e2a9ece9af2..6426f6a2e744 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3967,7 +3967,7 @@ static bool smc_parse_options(const struct tcphdr *th, /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ -static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { const unsigned char *ptr = (const unsigned char *)(th + 1); int length = (th->doff * 4) - sizeof(struct tcphdr); @@ -4006,6 +4006,7 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } +EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. 
* But, this can also be called on packets in the established flow when diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 855b937e7585..a0ec321469bd 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -635,6 +635,8 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'struct iphdr', + 'struct ipv6hdr', ] known_types = { '...', @@ -686,6 +688,8 @@ class PrinterHelpers(Printer): 'struct bpf_timer', 'struct mptcp_sock', 'struct bpf_dynptr', + 'struct iphdr', + 'struct ipv6hdr', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f545e39df72a..e81362891596 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5251,6 +5251,80 @@ union bpf_attr { * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5457,6 +5531,10 @@ union bpf_attr { FN(dynptr_read), \ FN(dynptr_write), \ FN(dynptr_data), \ + FN(tcp_raw_gen_syncookie_ipv4), \ + FN(tcp_raw_gen_syncookie_ipv6), \ + FN(tcp_raw_check_syncookie_ipv4), \ + FN(tcp_raw_check_syncookie_ipv6), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 5374d8fb75f313294c7d97e85c22bead34d63f2b Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Thu, 19 May 2022 08:11:46 +0100 Subject: media: Add P010 video format P010 is a YUV format with 10 bits per component and interleaved UV. Signed-off-by: Benjamin Gaignard Acked-by: Nicolas Dufresne Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/pixfmt-yuv-planar.rst | 54 ++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-common.c | 1 + drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/uapi/linux/videodev2.h | 1 + 4 files changed, 57 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst index 8dff5906639b..a900ff66911a 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst @@ -109,6 +109,13 @@ All components are stored with the same number of bits per component. - Cb, Cr - No - 16x16 tiles + * - V4L2_PIX_FMT_P010 + - 'P010' + - 10 + - 4:2:0 + - Cb, Cr + - Yes + - Linear * - V4L2_PIX_FMT_NV16 - 'NV16' - 8 @@ -171,6 +178,7 @@ horizontally. .. _V4L2-PIX-FMT-NV12: .. _V4L2-PIX-FMT-NV21: .. _V4L2-PIX-FMT-NV12M: .. _V4L2-PIX-FMT-NV21M: +.. _V4L2-PIX-FMT-P010: NV12, NV21, NV12M and NV21M --------------------------- @@ -519,6 +527,52 @@ number of lines as the luma plane. - Cb\ :sub:`33` - Cr\ :sub:`33` +.. _V4L2_PIX_FMT_P010: + +P010 +---- + +Like NV12 with 10 bits per component, expanded to 16 bits. +Data in the 10 high bits, zeros in the 6 low bits, arranged in little endian order. + +.. flat-table:: Sample 4x4 P010 Image + :header-rows: 0 + :stub-columns: 0 + + * - start + 0: + - Y'\ :sub:`00` + - Y'\ :sub:`01` + - Y'\ :sub:`02` + - Y'\ :sub:`03` + * - start + 8: + - Y'\ :sub:`10` + - Y'\ :sub:`11` + - Y'\ :sub:`12` + - Y'\ :sub:`13` + * - start + 16: + - Y'\ :sub:`20` + - Y'\ :sub:`21` + - Y'\ :sub:`22` + - Y'\ :sub:`23` + * - start + 24: + - Y'\ :sub:`30` + - Y'\ :sub:`31` + - Y'\ :sub:`32` + - Y'\ :sub:`33` + * - start + 32: + - Cb\ :sub:`00` + - Cr\ :sub:`00` + - Cb\ :sub:`01` + - Cr\ :sub:`01` + * - start + 40: + - Cb\ :sub:`10` + - Cr\ :sub:`10` + - Cb\ :sub:`11` + - Cr\ :sub:`11` + +..
raw:: latex + + \endgroup Fully Planar YUV Formats ======================== diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index df34b2a283bc..1e38ad8906a2 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -266,6 +266,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format) { .format = V4L2_PIX_FMT_NV61, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV24, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV42, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_P010, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YUV410, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 }, { .format = V4L2_PIX_FMT_YVU410, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 }, diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 21470de62d72..e2526701294e 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1306,6 +1306,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_NV61: descr = "Y/CrCb 4:2:2"; break; case V4L2_PIX_FMT_NV24: descr = "Y/CbCr 4:4:4"; break; case V4L2_PIX_FMT_NV42: descr = "Y/CrCb 4:4:4"; break; + case V4L2_PIX_FMT_P010: descr = "10-bit Y/CrCb 4:2:0"; break; case V4L2_PIX_FMT_NV12_4L4: descr = "Y/CbCr 4:2:0 (4x4 Linear)"; break; case V4L2_PIX_FMT_NV12_16L16: descr = "Y/CbCr 4:2:0 (16x16 Linear)"; break; case V4L2_PIX_FMT_NV12_32L32: descr = "Y/CbCr 4:2:0 (32x32 Linear)"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 343b95107fce..5311ac4fde35 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -602,6 +602,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_NV61 v4l2_fourcc('N', 'V', '6', '1') /* 16 Y/CrCb 4:2:2 */ #define V4L2_PIX_FMT_NV24 v4l2_fourcc('N', 'V', '2', '4') /* 24 Y/CbCr 4:4:4 */ #define V4L2_PIX_FMT_NV42 v4l2_fourcc('N', 'V', '4', '2') /* 24 Y/CrCb 4:4:4 */ +#define V4L2_PIX_FMT_P010 v4l2_fourcc('P', '0', '1', '0') /* 24 Y/CbCr 4:2:0 10-bit per component */ /* two non contiguous planes - one Y, one Cr + Cb interleaved */ #define V4L2_PIX_FMT_NV12M v4l2_fourcc('N', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 */ -- cgit v1.2.3 From 7b0a0e3c3a88260b6fcb017e49f198463aa62ed1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 14 Apr 2022 16:50:57 +0200 Subject: wifi: cfg80211: do some rework towards MLO link APIs In order to support multi-link operation with multiple links, start adding some APIs. The notable addition here is to have the link ID in a new nl80211 attribute, which will be used to differentiate the links in many nl80211 operations. So far, this patch adds the netlink NL80211_ATTR_MLO_LINK_ID attribute (as well as the NL80211_ATTR_MLO_LINKS attribute) and plugs it through the system in some places, checking the validity etc., along with other infrastructure needed for it. For now, I've decided to include only the over-the-air link ID in the API.
I know we discussed that we eventually need to have other ways of identifying a link, but for local AP mode and auth/assoc commands as well as set_key etc., we'll use the OTA ID. Also included in this patch is some refactoring of the data structures in struct wireless_dev, splitting the data for the first time into type-dependent pieces, to make reasoning about these things easier. Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 6 +- drivers/net/wireless/ath/wil6210/cfg80211.c | 9 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 4 +- drivers/net/wireless/marvell/libertas/mesh.c | 10 +- drivers/net/wireless/marvell/mwifiex/11h.c | 2 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 18 +- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 3 +- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 14 +- drivers/net/wireless/quantenna/qtnfmac/commands.c | 2 +- drivers/net/wireless/quantenna/qtnfmac/event.c | 15 +- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 4 +- include/linux/ieee80211.h | 3 + include/net/cfg80211.h | 99 +++- include/uapi/linux/nl80211.h | 28 + net/mac80211/cfg.c | 8 +- net/mac80211/mlme.c | 2 +- net/wireless/ap.c | 46 +- net/wireless/chan.c | 196 +++++-- net/wireless/core.c | 28 +- net/wireless/core.h | 13 +- net/wireless/ibss.c | 57 +- net/wireless/mesh.c | 31 +- net/wireless/mlme.c | 74 +-- net/wireless/nl80211.c | 623 +++++++++++++++------ net/wireless/ocb.c | 5 +- net/wireless/rdev-ops.h | 32 +- net/wireless/reg.c | 139 +++-- net/wireless/scan.c | 8 +- net/wireless/sme.c | 102 ++-- net/wireless/trace.h | 86 ++- net/wireless/util.c | 44 +- net/wireless/wext-compat.c | 48 +- net/wireless/wext-sme.c | 29 +- 33 files changed, 1255 insertions(+), 533 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index bd1183830e91..33ed54738d47 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1119,7 +1119,7 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq, NL80211_CHAN_HT20 : NL80211_CHAN_NO_HT); mutex_lock(&vif->wdev.mtx); - cfg80211_ch_switch_notify(vif->ndev, &chandef); + cfg80211_ch_switch_notify(vif->ndev, &chandef, 0); mutex_unlock(&vif->wdev.mtx); } @@ -2967,7 +2967,8 @@ static int ath6kl_change_beacon(struct wiphy *wiphy, struct net_device *dev, return ath6kl_set_ies(vif, beacon); } -static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int ath6kl_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct ath6kl *ar = ath6kl_priv(dev); struct ath6kl_vif *vif = netdev_priv(dev); @@ -3368,6 +3369,7 @@ static int ath6kl_cfg80211_sscan_stop(struct wiphy *wiphy, static int ath6kl_cfg80211_set_bitrate(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 8f2638f5b87b..f93bdffa4d1d 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -2098,8 +2098,8 @@ static int wil_cfg80211_change_beacon(struct wiphy *wiphy, bcon->tail_len)) privacy = 1; - memcpy(vif->ssid, wdev->ssid, wdev->ssid_len); - vif->ssid_len = wdev->ssid_len; + memcpy(vif->ssid, wdev->u.ap.ssid, wdev->u.ap.ssid_len); + vif->ssid_len = wdev->u.ap.ssid_len; /* in case privacy has changed, need to restart the AP */
if (vif->privacy != privacy) { @@ -2108,7 +2108,7 @@ static int wil_cfg80211_change_beacon(struct wiphy *wiphy, rc = _wil_cfg80211_start_ap(wiphy, ndev, vif->ssid, vif->ssid_len, privacy, - wdev->beacon_interval, + wdev->links[0].ap.beacon_interval, vif->channel, vif->wmi_edmg_channel, bcon, vif->hidden_ssid, @@ -2186,7 +2186,8 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy, } static int wil_cfg80211_stop_ap(struct wiphy *wiphy, - struct net_device *ndev) + struct net_device *ndev, + unsigned int link_id) { struct wil6210_priv *wil = wiphy_to_wil(wiphy); struct wil6210_vif *vif = ndev_to_vif(ndev); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 605206abe424..11e1f07f83e0 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4965,7 +4965,8 @@ exit: return err; } -static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev) +static int brcmf_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *ndev, + unsigned int link_id) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_if *ifp = netdev_priv(ndev); @@ -5302,6 +5303,7 @@ exit: static int brcmf_cfg80211_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); diff --git a/drivers/net/wireless/marvell/libertas/mesh.c b/drivers/net/wireless/marvell/libertas/mesh.c index a58c1e141f2c..90ffe8d1e0e8 100644 --- a/drivers/net/wireless/marvell/libertas/mesh.c +++ b/drivers/net/wireless/marvell/libertas/mesh.c @@ -109,9 +109,9 @@ static int lbs_mesh_config(struct lbs_private *priv, uint16_t action, if (priv->mesh_dev) { mesh_wdev = priv->mesh_dev->ieee80211_ptr; - ie->val.mesh_id_len = mesh_wdev->mesh_id_up_len; - memcpy(ie->val.mesh_id, mesh_wdev->ssid, - mesh_wdev->mesh_id_up_len); + ie->val.mesh_id_len = mesh_wdev->u.mesh.id_up_len; + memcpy(ie->val.mesh_id, mesh_wdev->u.mesh.id, + mesh_wdev->u.mesh.id_up_len); } ie->len = sizeof(struct mrvl_meshie_val) - @@ -986,8 +986,8 @@ static int lbs_add_mesh(struct lbs_private *priv) mesh_wdev->wiphy = priv->wdev->wiphy; if (priv->mesh_tlv) { - sprintf(mesh_wdev->ssid, "mesh"); - mesh_wdev->mesh_id_up_len = 4; + sprintf(mesh_wdev->u.mesh.id, "mesh"); + mesh_wdev->u.mesh.id_up_len = 4; } mesh_wdev->netdev = mesh_dev; diff --git a/drivers/net/wireless/marvell/mwifiex/11h.c b/drivers/net/wireless/marvell/mwifiex/11h.c index 3fa25cd64cda..4ca8d0135708 100644 --- a/drivers/net/wireless/marvell/mwifiex/11h.c +++ b/drivers/net/wireless/marvell/mwifiex/11h.c @@ -304,6 +304,6 @@ void mwifiex_dfs_chan_sw_work_queue(struct work_struct *work) mwifiex_dbg(priv->adapter, MSG, "indicating channel switch completion to kernel\n"); mutex_lock(&priv->wdev.mtx); - cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef); + cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef, 0); mutex_unlock(&priv->wdev.mtx); } diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 6f23ec34e2e2..d68c40e0e122 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -1753,10 +1753,12 @@ mwifiex_mgmt_stypes[NUM_NL80211_IFTYPES] = { * Function configures data rates to firmware using bitrate mask * provided by cfg80211. 
*/ -static int mwifiex_cfg80211_set_bitrate_mask(struct wiphy *wiphy, - struct net_device *dev, - const u8 *peer, - const struct cfg80211_bitrate_mask *mask) +static int +mwifiex_cfg80211_set_bitrate_mask(struct wiphy *wiphy, + struct net_device *dev, + unsigned int link_id, + const u8 *peer, + const struct cfg80211_bitrate_mask *mask) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); u16 bitmap_rates[MAX_BITMAP_RATES_SIZE]; @@ -1998,7 +2000,8 @@ mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) /* cfg80211 operation handler for stop ap. * Function stops BSS running at uAP interface. */ -static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); @@ -2421,7 +2424,7 @@ mwifiex_cfg80211_connect(struct wiphy *wiphy, struct net_device *dev, return -EINVAL; } - if (priv->wdev.current_bss) { + if (priv->wdev.connected) { mwifiex_dbg(adapter, ERROR, "%s: already connected\n", dev->name); return -EALREADY; @@ -2649,7 +2652,7 @@ mwifiex_cfg80211_scan(struct wiphy *wiphy, return -EBUSY; } - if (!priv->wdev.current_bss && priv->scan_block) + if (!priv->wdev.connected && priv->scan_block) priv->scan_block = false; if (!mwifiex_stop_bg_scan(priv)) @@ -4025,6 +4028,7 @@ mwifiex_cfg80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, static int mwifiex_cfg80211_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(wdev->netdev); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index 1ac4684fab25..5c2c7f1dbffd 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -1426,7 +1426,8 @@ static int change_beacon(struct wiphy *wiphy, struct net_device *dev, return wilc_add_beacon(vif, 0, 0, beacon); } -static int stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { int ret; struct wilc_vif *vif = netdev_priv(dev); diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 84b15a655eab..1593e810b3ca 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -352,7 +352,8 @@ static int qtnf_start_ap(struct wiphy *wiphy, struct net_device *dev, return ret; } -static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -500,7 +501,7 @@ qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev, switch (vif->wdev.iftype) { case NL80211_IFTYPE_STATION: - if (idx != 0 || !vif->wdev.current_bss) + if (idx != 0 || !vif->wdev.connected) return -ENOENT; ether_addr_copy(mac, vif->bssid); @@ -729,7 +730,7 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev, pr_err("VIF%u.%u: failed to disconnect\n", mac->macid, vif->vifid); - if (vif->wdev.current_bss) { + if (vif->wdev.connected) { netif_carrier_off(vif->netdev); cfg80211_disconnected(vif->netdev, reason_code, NULL, 0, true, GFP_KERNEL); @@ -745,10 +746,11 @@ qtnf_dump_survey(struct wiphy 
*wiphy, struct net_device *dev, struct qtnf_wmac *mac = wiphy_priv(wiphy); struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_supported_band *sband; - const struct cfg80211_chan_def *chandef = &wdev->chandef; + const struct cfg80211_chan_def *chandef = wdev_chandef(wdev, 0); struct ieee80211_channel *chan; int ret; + sband = wiphy->bands[NL80211_BAND_2GHZ]; if (sband && idx >= sband->n_channels) { idx -= sband->n_channels; @@ -765,7 +767,7 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev, survey->channel = chan; survey->filled = 0x0; - if (chan == chandef->chan) + if (chandef && chan == chandef->chan) survey->filled = SURVEY_INFO_IN_USE; ret = qtnf_cmd_get_chan_stats(mac, chan->center_freq, survey); @@ -778,7 +780,7 @@ qtnf_dump_survey(struct wiphy *wiphy, struct net_device *dev, static int qtnf_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, - struct cfg80211_chan_def *chandef) + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct net_device *ndev = wdev->netdev; struct qtnf_vif *vif; diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c index c68563c83098..3d734a7a5ba8 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/commands.c +++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c @@ -2005,7 +2005,7 @@ int qtnf_cmd_send_scan(struct qtnf_wmac *mac) dwell_active = scan_req->duration; dwell_passive = scan_req->duration; } else if (wdev->iftype == NL80211_IFTYPE_STATION && - wdev->current_bss) { + wdev->connected) { /* let device select dwell based on traffic conditions */ dwell_active = QTNF_SCAN_TIME_AUTO; dwell_passive = QTNF_SCAN_TIME_AUTO; diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c index 8dc80574d08d..4fafe370101a 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/event.c +++ b/drivers/net/wireless/quantenna/qtnfmac/event.c @@ -189,7 +189,7 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif, vif->mac->macid, vif->vifid, join_info->bssid, chandef.chan->hw_value); - if (!vif->wdev.ssid_len) { + if (!vif->wdev.u.client.ssid_len) { pr_warn("VIF%u.%u: SSID unknown for BSS:%pM\n", vif->mac->macid, vif->vifid, join_info->bssid); @@ -197,7 +197,7 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif, goto done; } - ie = kzalloc(2 + vif->wdev.ssid_len, GFP_KERNEL); + ie = kzalloc(2 + vif->wdev.u.client.ssid_len, GFP_KERNEL); if (!ie) { pr_warn("VIF%u.%u: IE alloc failed for BSS:%pM\n", vif->mac->macid, vif->vifid, @@ -207,14 +207,15 @@ qtnf_event_handle_bss_join(struct qtnf_vif *vif, } ie[0] = WLAN_EID_SSID; - ie[1] = vif->wdev.ssid_len; - memcpy(ie + 2, vif->wdev.ssid, vif->wdev.ssid_len); + ie[1] = vif->wdev.u.client.ssid_len; + memcpy(ie + 2, vif->wdev.u.client.ssid, + vif->wdev.u.client.ssid_len); bss = cfg80211_inform_bss(wiphy, chandef.chan, CFG80211_BSS_FTYPE_UNKNOWN, join_info->bssid, 0, WLAN_CAPABILITY_ESS, 100, - ie, 2 + vif->wdev.ssid_len, + ie, 2 + vif->wdev.u.client.ssid_len, 0, GFP_KERNEL); if (!bss) { pr_warn("VIF%u.%u: can't connect to unknown BSS: %pM\n", @@ -470,14 +471,14 @@ qtnf_event_handle_freq_change(struct qtnf_wmac *mac, continue; if (vif->wdev.iftype == NL80211_IFTYPE_STATION && - !vif->wdev.current_bss) + !vif->wdev.connected) continue; if (!vif->netdev) continue; mutex_lock(&vif->wdev.mtx); - cfg80211_ch_switch_notify(vif->netdev, &chandef); + cfg80211_ch_switch_notify(vif->netdev, &chandef, 0); mutex_unlock(&vif->wdev.mtx); } diff --git 
a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 43b5604c0bca..349aa3c4b668 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -2086,6 +2086,7 @@ static u8 rtw_get_chan_type(struct adapter *adapter) } static int cfg80211_rtw_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct adapter *adapter = wiphy_to_adapter(wiphy); @@ -2446,7 +2447,8 @@ static int cfg80211_rtw_change_beacon(struct wiphy *wiphy, struct net_device *nd return rtw_add_beacon(adapter, info->head, info->head_len, info->tail, info->tail_len); } -static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev) +static int cfg80211_rtw_stop_ap(struct wiphy *wiphy, struct net_device *ndev, + unsigned int link_id) { return 0; } diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 5c65ae6b8154..e15771965916 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4376,4 +4376,7 @@ enum ieee80211_range_params_max_total_ltf { IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_UNSPECIFIED, }; +/* multi-link device */ +#define IEEE80211_MLD_MAX_NUM_LINKS 15 + #endif /* LINUX_IEEE80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6d02e12e4702..772e099fc932 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1158,6 +1158,7 @@ struct cfg80211_mbssid_elems { /** * struct cfg80211_beacon_data - beacon data + * @link_id: the link ID for the AP MLD link sending this beacon * @head: head portion of beacon (before TIM IE) * or %NULL if not changed * @tail: tail portion of beacon (after TIM IE) @@ -1188,6 +1189,8 @@ struct cfg80211_mbssid_elems { * attribute is present in beacon data or not. */ struct cfg80211_beacon_data { + unsigned int link_id; + const u8 *head, *tail; const u8 *beacon_ies; const u8 *proberesp_ies; @@ -4201,7 +4204,8 @@ struct cfg80211_ops { struct cfg80211_ap_settings *settings); int (*change_beacon)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_beacon_data *info); - int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev); + int (*stop_ap)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id); int (*add_station)(struct wiphy *wiphy, struct net_device *dev, @@ -4309,6 +4313,7 @@ struct cfg80211_ops { int (*set_bitrate_mask)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask); @@ -4384,6 +4389,7 @@ struct cfg80211_ops { int (*get_channel)(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef); int (*start_p2p_device)(struct wiphy *wiphy, @@ -4420,6 +4426,7 @@ struct cfg80211_ops { struct cfg80211_qos_map *qos_map); int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct cfg80211_chan_def *chandef); int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, @@ -4545,10 +4552,14 @@ struct cfg80211_ops { * @WIPHY_FLAG_HAS_STATIC_WEP: The device supports static WEP key installation * before connection. * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys + * @WIPHY_FLAG_SUPPORTS_MLO: This is a temporary flag gating the MLO APIs, + * in order to not have them reachable in normal drivers, until we have + * complete feature/interface combinations/etc. advertisement. No driver + * should set this flag for now. 
*/ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), - /* use hole at 1 */ + WIPHY_FLAG_SUPPORTS_MLO = BIT(1), WIPHY_FLAG_SPLIT_SCAN_6GHZ = BIT(2), WIPHY_FLAG_NETNS_OK = BIT(3), WIPHY_FLAG_PS_ON_BY_DEFAULT = BIT(4), @@ -5505,6 +5516,8 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * @netdev: (private) Used to reference back to the netdev, may be %NULL * @identifier: (private) Identifier used in nl80211 to identify this * wireless device if it has no netdev + * @connected_addr: (private) BSSID or AP MLD address if connected + * @connected: indicates if connected or not (STA mode) * @current_bss: (private) Used by the internal configuration code * @chandef: (private) Used by the internal configuration code to track * the user-set channel definition. @@ -5585,8 +5598,6 @@ struct wireless_dev { u8 address[ETH_ALEN] __aligned(sizeof(u16)); /* currently used for IBSS and SME - might be rearranged later */ - u8 ssid[IEEE80211_MAX_SSID_LEN]; - u8 ssid_len, mesh_id_len, mesh_id_up_len; struct cfg80211_conn *conn; struct cfg80211_cached_keys *connect_keys; enum ieee80211_bss_type conn_bss_type; @@ -5598,20 +5609,17 @@ struct wireless_dev { struct list_head event_list; spinlock_t event_lock; - struct cfg80211_internal_bss *current_bss; /* associated / joined */ - struct cfg80211_chan_def preset_chandef; - struct cfg80211_chan_def chandef; + u8 connected:1; bool ps; int ps_timeout; - int beacon_interval; - u32 ap_unexpected_nlportid; u32 owner_nlportid; bool nl_owner_dead; + /* FIXME: need to rework radar detection for MLO */ bool cac_started; unsigned long cac_start_time; unsigned int cac_time_ms; @@ -5639,6 +5647,50 @@ struct wireless_dev { struct work_struct pmsr_free_wk; unsigned long unprot_beacon_reported; + + union { + struct { + u8 connected_addr[ETH_ALEN] __aligned(2); + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } client; + struct { + int beacon_interval; + struct cfg80211_chan_def preset_chandef; + struct cfg80211_chan_def chandef; + u8 id[IEEE80211_MAX_SSID_LEN]; + u8 id_len, id_up_len; + } mesh; + struct { + struct cfg80211_chan_def preset_chandef; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } ap; + struct { + struct cfg80211_internal_bss *current_bss; + struct cfg80211_chan_def chandef; + int beacon_interval; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + } ibss; + struct { + struct cfg80211_chan_def chandef; + } ocb; + } u; + + struct { + u8 addr[ETH_ALEN] __aligned(2); + union { + struct { + unsigned int beacon_interval; + struct cfg80211_chan_def chandef; + } ap; + struct { + struct cfg80211_internal_bss *current_bss; + } client; + }; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 valid_links; }; static inline const u8 *wdev_address(struct wireless_dev *wdev) @@ -5667,6 +5719,31 @@ static inline void *wdev_priv(struct wireless_dev *wdev) return wiphy_priv(wdev->wiphy); } +/** + * wdev_chandef - return chandef pointer from wireless_dev + * @wdev: the wdev + * @link_id: the link ID for MLO + * + * Return: The chandef depending on the mode, or %NULL. + */ +struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, + unsigned int link_id); + +static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev, + unsigned int link_id) +{ + WARN_ON(link_id && !wdev->valid_links); + WARN_ON(wdev->valid_links && + !(wdev->valid_links & BIT(link_id))); +} + +#define for_each_valid_link(wdev, link_id) \ + for (link_id = 0; \ + link_id < ((wdev)->valid_links ? 
ARRAY_SIZE((wdev)->links) : 1); \ + link_id++) \ + if (!(wdev)->valid_links || \ + ((wdev)->valid_links & BIT(link_id))) + /** * DOC: Utility functions * @@ -7882,12 +7959,14 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, * cfg80211_ch_switch_notify - update wdev channel and notify userspace * @dev: the device which switched channels * @chandef: the new channel definition + * @link_id: the link ID for MLO, must be 0 for non-MLO * * Caller must acquire wdev_lock, therefore must only be called from sleepable * driver context! */ void cfg80211_ch_switch_notify(struct net_device *dev, - struct cfg80211_chan_def *chandef); + struct cfg80211_chan_def *chandef, + unsigned int link_id); /* * cfg80211_ch_switch_started_notify - notify channel switch start diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 98f905f16411..a9a2c9fef295 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -323,6 +323,17 @@ * Once the association is done, the driver cleans the FILS AAD data. */ +/** + * DOC: Multi-Link Operation + * + * In Multi-Link Operation, a connection between to MLDs utilizes multiple + * links. To use this in nl80211, various commands and responses now need + * to or will include the new %NL80211_ATTR_MLO_LINKS attribute. + * Additionally, various commands that need to operate on a specific link + * now need to be given the %NL80211_ATTR_MLO_LINK_ID attribute, e.g. to + * use %NL80211_CMD_START_AP or similar functions. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -1237,6 +1248,12 @@ * to describe the BSSID address of the AP and %NL80211_ATTR_TIMEOUT to * specify the timeout value. * + * @NL80211_CMD_ADD_LINK: Add a new link to an interface. The + * %NL80211_ATTR_MLO_LINK_ID attribute is used for the new link. + * @NL80211_CMD_REMOVE_LINK: Remove a link from an interface. This may come + * without %NL80211_ATTR_MLO_LINK_ID as an easy way to remove all links + * in preparation for e.g. roaming to a regular (non-MLO) AP. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1481,6 +1498,9 @@ enum nl80211_commands { NL80211_CMD_ASSOC_COMEBACK, + NL80211_CMD_ADD_LINK, + NL80211_CMD_REMOVE_LINK, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2663,6 +2683,11 @@ enum nl80211_commands { * association request when used with NL80211_CMD_NEW_STATION). Can be set * only if %NL80211_STA_FLAG_WME is set. * + * @NL80211_ATTR_MLO_LINK_ID: A (u8) link ID for use with MLO, to be used with + * various commands that need a link ID to operate. + * @NL80211_ATTR_MLO_LINKS: A nested array of links, each containing some + * per-link information and a link ID. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3177,6 +3202,9 @@ enum nl80211_attrs { NL80211_ATTR_DISABLE_EHT, + NL80211_ATTR_MLO_LINKS, + NL80211_ATTR_MLO_LINK_ID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 881efbfb96f6..362cac9e2135 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1342,7 +1342,8 @@ static void ieee80211_free_next_beacon(struct ieee80211_sub_if_data *sdata) sdata->u.ap.next_beacon = NULL; } -static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) +static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_sub_if_data *vlan; @@ -3049,6 +3050,7 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, const u8 *addr, const struct cfg80211_bitrate_mask *mask) { @@ -3390,7 +3392,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) if (err) return err; - cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef); + cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef, 0); return 0; } @@ -3898,6 +3900,7 @@ unlock: static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -3958,6 +3961,7 @@ static int ieee80211_set_qos_map(struct wiphy *wiphy, static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6d5ad71ef02c..e0a9b7d63071 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1314,7 +1314,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata) return; } - cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef); + cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef, 0); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 550ac9d827fe..e68923200018 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -1,4 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Parts of this file are + * Copyright (C) 2022 Intel Corporation + */ #include #include #include @@ -7,8 +11,9 @@ #include "rdev-ops.h" -int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify) +static int ___cfg80211_stop_ap(struct cfg80211_registered_device *rdev, + struct net_device *dev, unsigned int link_id, + bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; @@ -22,15 +27,16 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; - if (!wdev->beacon_interval) + if (!wdev->links[link_id].ap.beacon_interval) return -ENOENT; - err = rdev_stop_ap(rdev, dev); + err = rdev_stop_ap(rdev, dev, link_id); if (!err) { wdev->conn_owner_nlportid = 0; - wdev->beacon_interval = 0; - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); - wdev->ssid_len = 
0; + wdev->links[link_id].ap.beacon_interval = 0; + memset(&wdev->links[link_id].ap.chandef, 0, + sizeof(wdev->links[link_id].ap.chandef)); + wdev->u.ap.ssid_len = 0; rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev); @@ -46,14 +52,36 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, return err; } +int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, + struct net_device *dev, int link_id, + bool notify) +{ + unsigned int link; + int ret = 0; + + if (link_id >= 0) + return ___cfg80211_stop_ap(rdev, dev, link_id, notify); + + for_each_valid_link(dev->ieee80211_ptr, link) { + int ret1 = ___cfg80211_stop_ap(rdev, dev, link, notify); + + if (ret1) + ret = ret1; + /* try the next one also if one errored */ + } + + return ret; +} + int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify) + struct net_device *dev, int link_id, + bool notify) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; wdev_lock(wdev); - err = __cfg80211_stop_ap(rdev, dev, notify); + err = __cfg80211_stop_ap(rdev, dev, link_id, notify); wdev_unlock(wdev); return err; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index f74f176e0d9d..efc2de4bab57 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -672,14 +672,21 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, * range of chandef. */ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, - struct ieee80211_channel *chan) + struct ieee80211_channel *chan, + bool primary_only) { int width; u32 freq; + if (!chandef->chan) + return false; + if (chandef->chan->center_freq == chan->center_freq) return true; + if (primary_only) + return false; + width = cfg80211_chandef_get_width(chandef); if (width <= 20) return false; @@ -704,23 +711,25 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) { - bool active = false; + unsigned int link; ASSERT_WDEV_LOCK(wdev); - if (!wdev->chandef.chan) - return false; - switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - active = wdev->beacon_interval != 0; + for_each_valid_link(wdev, link) { + if (wdev->links[link].ap.beacon_interval) + return true; + } break; case NL80211_IFTYPE_ADHOC: - active = wdev->ssid_len != 0; + if (wdev->u.ibss.ssid_len) + return true; break; case NL80211_IFTYPE_MESH_POINT: - active = wdev->mesh_id_len != 0; + if (wdev->u.mesh.id_len) + return true; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_OCB: @@ -737,7 +746,35 @@ bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) WARN_ON(1); } - return active; + return false; +} + +bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, + struct ieee80211_channel *chan, + bool primary_only) +{ + unsigned int link; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + for_each_valid_link(wdev, link) { + if (cfg80211_is_sub_chan(&wdev->links[link].ap.chandef, + chan, primary_only)) + return true; + } + break; + case NL80211_IFTYPE_ADHOC: + return cfg80211_is_sub_chan(&wdev->u.ibss.chandef, chan, + primary_only); + case NL80211_IFTYPE_MESH_POINT: + return cfg80211_is_sub_chan(&wdev->u.mesh.chandef, chan, + primary_only); + default: + break; + } + + return false; } static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy, @@ -752,7 +789,7 @@ static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy, continue; } - if (cfg80211_is_sub_chan(&wdev->chandef, chan)) { + if 
(cfg80211_wdev_on_sub_chan(wdev, chan, false)) { wdev_unlock(wdev); return true; } @@ -772,7 +809,8 @@ cfg80211_offchan_chain_is_active(struct cfg80211_registered_device *rdev, if (!cfg80211_chandef_valid(&rdev->background_radar_chandef)) return false; - return cfg80211_is_sub_chan(&rdev->background_radar_chandef, channel); + return cfg80211_is_sub_chan(&rdev->background_radar_chandef, channel, + false); } bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, @@ -1176,6 +1214,68 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_chandef_usable); +static bool cfg80211_ir_permissive_check_wdev(enum nl80211_iftype iftype, + struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + struct ieee80211_channel *other_chan = NULL; + unsigned int link_id; + int r1, r2; + + for_each_valid_link(wdev, link_id) { + if (wdev->iftype == NL80211_IFTYPE_STATION && + wdev->links[link_id].client.current_bss) + other_chan = wdev->links[link_id].client.current_bss->pub.channel; + + /* + * If a GO already operates on the same GO_CONCURRENT channel, + * this one (maybe the same one) can beacon as well. We allow + * the operation even if the station we relied on with + * GO_CONCURRENT is disconnected now. But then we must make sure + * we're not outdoor on an indoor-only channel. + */ + if (iftype == NL80211_IFTYPE_P2P_GO && + wdev->iftype == NL80211_IFTYPE_P2P_GO && + wdev->links[link_id].ap.beacon_interval && + !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) + other_chan = wdev->links[link_id].ap.chandef.chan; + + if (!other_chan) + continue; + + if (chan == other_chan) + return true; + + if (chan->band != NL80211_BAND_5GHZ && + chan->band != NL80211_BAND_6GHZ) + continue; + + r1 = cfg80211_get_unii(chan->center_freq); + r2 = cfg80211_get_unii(other_chan->center_freq); + + if (r1 != -EINVAL && r1 == r2) { + /* + * At some locations channels 149-165 are considered a + * bundle, but at other locations, e.g., Indonesia, + * channels 149-161 are considered a bundle while + * channel 165 is left out and considered to be in a + * different bundle. Thus, in case that there is a + * station interface connected to an AP on channel 165, + * it is assumed that channels 149-161 are allowed for + * GO operations. However, having a station interface + * connected to an AP on channels 149-161, does not + * allow GO operation on channel 165. + */ + if (chan->center_freq == 5825 && + other_chan->center_freq != 5825) + continue; + return true; + } + } + + return false; +} + /* * Check if the channel can be used under permissive conditions mandated by * some regulatory bodies, i.e., the channel is marked with @@ -1219,59 +1319,14 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, * the current registered device. */ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { - struct ieee80211_channel *other_chan = NULL; - int r1, r2; + bool ret; wdev_lock(wdev); - if (wdev->iftype == NL80211_IFTYPE_STATION && - wdev->current_bss) - other_chan = wdev->current_bss->pub.channel; - - /* - * If a GO already operates on the same GO_CONCURRENT channel, - * this one (maybe the same one) can beacon as well. We allow - * the operation even if the station we relied on with - * GO_CONCURRENT is disconnected now. But then we must make sure - * we're not outdoor on an indoor-only channel. 
- */ - if (iftype == NL80211_IFTYPE_P2P_GO && - wdev->iftype == NL80211_IFTYPE_P2P_GO && - wdev->beacon_interval && - !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) - other_chan = wdev->chandef.chan; + ret = cfg80211_ir_permissive_check_wdev(iftype, wdev, chan); wdev_unlock(wdev); - if (!other_chan) - continue; - - if (chan == other_chan) - return true; - - if (chan->band != NL80211_BAND_5GHZ && - chan->band != NL80211_BAND_6GHZ) - continue; - - r1 = cfg80211_get_unii(chan->center_freq); - r2 = cfg80211_get_unii(other_chan->center_freq); - - if (r1 != -EINVAL && r1 == r2) { - /* - * At some locations channels 149-165 are considered a - * bundle, but at other locations, e.g., Indonesia, - * channels 149-161 are considered a bundle while - * channel 165 is left out and considered to be in a - * different bundle. Thus, in case that there is a - * station interface connected to an AP on channel 165, - * it is assumed that channels 149-161 are allowed for - * GO operations. However, having a station interface - * connected to an AP on channels 149-161, does not - * allow GO operation on channel 165. - */ - if (chan->center_freq == 5825 && - other_chan->center_freq != 5825) - continue; - return true; - } + if (ret) + return ret; } return false; @@ -1374,3 +1429,24 @@ bool cfg80211_any_usable_channels(struct wiphy *wiphy, return false; } EXPORT_SYMBOL(cfg80211_any_usable_channels); + +struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, + unsigned int link_id) +{ + ASSERT_WDEV_LOCK(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_MESH_POINT: + return &wdev->u.mesh.chandef; + case NL80211_IFTYPE_ADHOC: + return &wdev->u.ibss.chandef; + case NL80211_IFTYPE_OCB: + return &wdev->u.ocb.chandef; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + return &wdev->links[link_id].ap.chandef; + default: + return NULL; + } +} +EXPORT_SYMBOL(wdev_chandef); diff --git a/net/wireless/core.c b/net/wireless/core.c index f08d4b3bb148..3e5d12040726 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1118,6 +1118,7 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + unsigned int link_id; ASSERT_RTNL(); lockdep_assert_held(&rdev->wiphy.mtx); @@ -1167,11 +1168,22 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, */ cfg80211_process_wdev_events(wdev); - if (WARN_ON(wdev->current_bss)) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; + if (wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { + for (link_id = 0; link_id < ARRAY_SIZE(wdev->links); link_id++) { + struct cfg80211_internal_bss *curbss; + + curbss = wdev->links[link_id].client.current_bss; + + if (WARN_ON(curbss)) { + cfg80211_unhold_bss(curbss); + cfg80211_put_bss(wdev->wiphy, &curbss->pub); + wdev->links[link_id].client.current_bss = NULL; + } + } } + + wdev->connected = false; } void cfg80211_unregister_wdev(struct wireless_dev *wdev) @@ -1233,7 +1245,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - __cfg80211_stop_ap(rdev, dev, true); + __cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_OCB: __cfg80211_leave_ocb(rdev, dev); @@ -1463,9 +1475,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, memcpy(&setup, &default_mesh_setup, sizeof(setup)); /* back compat 
only needed for mesh_id */ - setup.mesh_id = wdev->ssid; - setup.mesh_id_len = wdev->mesh_id_up_len; - if (wdev->mesh_id_up_len) + setup.mesh_id = wdev->u.mesh.id; + setup.mesh_id_len = wdev->u.mesh.id_up_len; + if (wdev->u.mesh.id_up_len) __cfg80211_join_mesh(rdev, dev, &setup, &default_mesh_config); diff --git a/net/wireless/core.h b/net/wireless/core.h index 5436ada91b1a..2c195067ddff 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -307,6 +307,7 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev); void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, + unsigned int link, struct ieee80211_channel *channel); /* IBSS */ @@ -353,9 +354,11 @@ int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, /* AP */ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify); + struct net_device *dev, int link, + bool notify); int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev, bool notify); + struct net_device *dev, int link, + bool notify); /* MLME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, @@ -507,7 +510,11 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev); bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, - struct ieee80211_channel *chan); + struct ieee80211_channel *chan, + bool primary_only); +bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, + struct ieee80211_channel *chan, + bool primary_only); static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 5d89eec2869a..4935f94d1acc 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -28,7 +28,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return; - if (!wdev->ssid_len) + if (!wdev->u.ibss.ssid_len) return; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, @@ -37,13 +37,13 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, if (WARN_ON(!bss)) return; - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); + if (wdev->u.ibss.current_bss) { + cfg80211_unhold_bss(wdev->u.ibss.current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } cfg80211_hold_bss(bss_from_pub(bss)); - wdev->current_bss = bss_from_pub(bss); + wdev->u.ibss.current_bss = bss_from_pub(bss); if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) cfg80211_upload_connect_keys(wdev); @@ -96,7 +96,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, lockdep_assert_held(&rdev->wiphy.mtx); ASSERT_WDEV_LOCK(wdev); - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) return -EALREADY; if (!params->basic_rates) { @@ -131,7 +131,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, kfree_sensitive(wdev->connect_keys); wdev->connect_keys = connkeys; - wdev->chandef = params->chandef; + wdev->u.ibss.chandef = params->chandef; if (connkeys) { params->wep_keys = connkeys->params; params->wep_tx_key = connkeys->def; @@ -146,8 +146,8 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, return err; } - memcpy(wdev->ssid, params->ssid, params->ssid_len); - wdev->ssid_len = params->ssid_len; + memcpy(wdev->u.ibss.ssid, params->ssid, 
params->ssid_len); + wdev->u.ibss.ssid_len = params->ssid_len; return 0; } @@ -173,14 +173,14 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) for (i = 0; i < 6; i++) rdev_del_key(rdev, dev, i, false, NULL); - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); + if (wdev->u.ibss.current_bss) { + cfg80211_unhold_bss(wdev->u.ibss.current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } - wdev->current_bss = NULL; - wdev->ssid_len = 0; - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + wdev->u.ibss.current_bss = NULL; + wdev->u.ibss.ssid_len = 0; + memset(&wdev->u.ibss.chandef, 0, sizeof(wdev->u.ibss.chandef)); #ifdef CONFIG_CFG80211_WEXT if (!nowext) wdev->wext.ibss.ssid_len = 0; @@ -205,7 +205,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - if (!wdev->ssid_len) + if (!wdev->u.ibss.ssid_len) return -ENOLINK; err = rdev_leave_ibss(rdev, dev); @@ -339,7 +339,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, wdev_lock(wdev); err = 0; - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); @@ -374,8 +374,8 @@ int cfg80211_ibss_wext_giwfreq(struct net_device *dev, return -EINVAL; wdev_lock(wdev); - if (wdev->current_bss) - chan = wdev->current_bss->pub.channel; + if (wdev->u.ibss.current_bss) + chan = wdev->u.ibss.current_bss->pub.channel; else if (wdev->wext.ibss.chandef.chan) chan = wdev->wext.ibss.chandef.chan; wdev_unlock(wdev); @@ -408,7 +408,7 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev, wdev_lock(wdev); err = 0; - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); @@ -419,8 +419,8 @@ int cfg80211_ibss_wext_siwessid(struct net_device *dev, if (len > 0 && ssid[len - 1] == '\0') len--; - memcpy(wdev->ssid, ssid, len); - wdev->wext.ibss.ssid = wdev->ssid; + memcpy(wdev->u.ibss.ssid, ssid, len); + wdev->wext.ibss.ssid = wdev->u.ibss.ssid; wdev->wext.ibss.ssid_len = len; wdev_lock(wdev); @@ -443,10 +443,10 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev, data->flags = 0; wdev_lock(wdev); - if (wdev->ssid_len) { + if (wdev->u.ibss.ssid_len) { data->flags = 1; - data->length = wdev->ssid_len; - memcpy(ssid, wdev->ssid, data->length); + data->length = wdev->u.ibss.ssid_len; + memcpy(ssid, wdev->u.ibss.ssid, data->length); } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) { data->flags = 1; data->length = wdev->wext.ibss.ssid_len; @@ -494,7 +494,7 @@ int cfg80211_ibss_wext_siwap(struct net_device *dev, wdev_lock(wdev); err = 0; - if (wdev->ssid_len) + if (wdev->u.ibss.ssid_len) err = __cfg80211_leave_ibss(rdev, dev, true); wdev_unlock(wdev); @@ -527,8 +527,9 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev, ap_addr->sa_family = ARPHRD_ETHER; wdev_lock(wdev); - if (wdev->current_bss) - memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); + if (wdev->u.ibss.current_bss) + memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid, + ETH_ALEN); else if (wdev->wext.ibss.bssid) memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); else diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index e4e363138279..59a3c5c092b1 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -1,4 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Portions + * Copyright (C) 2022 Intel Corporation + */ #include #include #include @@ -114,7 +118,7 @@ 
int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, setup->is_secure) return -EOPNOTSUPP; - if (wdev->mesh_id_len) + if (wdev->u.mesh.id_len) return -EALREADY; if (!setup->mesh_id_len) @@ -125,7 +129,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, if (!setup->chandef.chan) { /* if no channel explicitly given, use preset channel */ - setup->chandef = wdev->preset_chandef; + setup->chandef = wdev->u.mesh.preset_chandef; } if (!setup->chandef.chan) { @@ -209,10 +213,10 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, err = rdev_join_mesh(rdev, dev, conf, setup); if (!err) { - memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len); - wdev->mesh_id_len = setup->mesh_id_len; - wdev->chandef = setup->chandef; - wdev->beacon_interval = setup->beacon_interval; + memcpy(wdev->u.mesh.id, setup->mesh_id, setup->mesh_id_len); + wdev->u.mesh.id_len = setup->mesh_id_len; + wdev->u.mesh.chandef = setup->chandef; + wdev->u.mesh.beacon_interval = setup->beacon_interval; } return err; @@ -241,15 +245,15 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, err = rdev_libertas_set_mesh_channel(rdev, wdev->netdev, chandef->chan); if (!err) - wdev->chandef = *chandef; + wdev->u.mesh.chandef = *chandef; return err; } - if (wdev->mesh_id_len) + if (wdev->u.mesh.id_len) return -EBUSY; - wdev->preset_chandef = *chandef; + wdev->u.mesh.preset_chandef = *chandef; return 0; } @@ -267,15 +271,16 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, if (!rdev->ops->leave_mesh) return -EOPNOTSUPP; - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) return -ENOTCONN; err = rdev_leave_mesh(rdev, dev); if (!err) { wdev->conn_owner_nlportid = 0; - wdev->mesh_id_len = 0; - wdev->beacon_interval = 0; - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + wdev->u.mesh.id_len = 0; + wdev->u.mesh.beacon_interval = 0; + memset(&wdev->u.mesh.chandef, 0, + sizeof(wdev->u.mesh.chandef)); rdev_set_qos_map(rdev, dev, NULL); cfg80211_sched_dfs_chan_update(rdev); } diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index c8155a483ec2..fab2d6206cdd 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -92,8 +92,7 @@ static void cfg80211_process_deauth(struct wireless_dev *wdev, nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); - if (!wdev->current_bss || - !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) + if (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid)) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); @@ -113,8 +112,8 @@ static void cfg80211_process_disassoc(struct wireless_dev *wdev, nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); - if (WARN_ON(!wdev->current_bss || - !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + if (WARN_ON(!wdev->connected || + !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); @@ -260,8 +259,8 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, if (!key || !key_len || key_idx < 0 || key_idx > 3) return -EINVAL; - if (wdev->current_bss && - ether_addr_equal(bssid, wdev->current_bss->pub.bssid)) + if (wdev->connected && + ether_addr_equal(bssid, wdev->u.client.connected_addr)) return -EALREADY; req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, @@ -322,9 +321,9 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - if 
(wdev->current_bss && - (!req->prev_bssid || !ether_addr_equal(wdev->current_bss->pub.bssid, - req->prev_bssid))) + if (wdev->connected && + (!req->prev_bssid || + !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) return -EALREADY; cfg80211_oper_and_ht_capa(&req->ht_capa_mask, @@ -364,13 +363,13 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); if (local_state_change && - (!wdev->current_bss || - !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + (!wdev->connected || + !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return 0; if (ether_addr_equal(wdev->disconnect_bssid, bssid) || - (wdev->current_bss && - ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + (wdev->connected && + ether_addr_equal(wdev->u.client.connected_addr, bssid))) wdev->conn_owner_nlportid = 0; return rdev_deauth(rdev, dev, &req); @@ -392,11 +391,12 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - if (!wdev->current_bss) + if (!wdev->connected) return -ENOTCONN; - if (ether_addr_equal(wdev->current_bss->pub.bssid, bssid)) - req.bss = &wdev->current_bss->pub; + if (ether_addr_equal(wdev->links[0].client.current_bss->pub.bssid, + bssid)) + req.bss = &wdev->links[0].client.current_bss->pub; else return -ENOTCONN; @@ -405,7 +405,7 @@ int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, return err; /* driver should have reported the disassoc */ - WARN_ON(wdev->current_bss); + WARN_ON(wdev->connected); return 0; } @@ -420,10 +420,10 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, if (!rdev->ops->deauth) return; - if (!wdev->current_bss) + if (!wdev->connected) return; - memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); + memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN); cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); } @@ -676,28 +676,34 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: + /* + * check for IBSS DA must be done by driver as + * cfg80211 doesn't track the stations + */ + if (!wdev->u.ibss.current_bss || + !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid, + mgmt->bssid)) { + err = -ENOTCONN; + break; + } + break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!wdev->current_bss) { + if (!wdev->connected) { err = -ENOTCONN; break; } - if (!ether_addr_equal(wdev->current_bss->pub.bssid, + /* FIXME: MLD may address this differently */ + + if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->bssid)) { err = -ENOTCONN; break; } - /* - * check for IBSS DA must be done by driver as - * cfg80211 doesn't track the stations - */ - if (wdev->iftype == NL80211_IFTYPE_ADHOC) - break; - /* for station, check that DA is the AP */ - if (!ether_addr_equal(wdev->current_bss->pub.bssid, + if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->da)) { err = -ENOTCONN; break; @@ -743,12 +749,12 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) return -EINVAL; - if (!wdev->current_bss && + if (!wdev->connected && !wiphy_ext_feature_isset( &rdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) return -EINVAL; - if (wdev->current_bss && + if (wdev->connected && !wiphy_ext_feature_isset( &rdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) @@ -940,12 +946,16 @@ void cfg80211_cac_event(struct 
net_device *netdev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; + /* not yet supported */ + if (wdev->valid_links) + return; + trace_cfg80211_cac_event(netdev, event); if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED)) return; - if (WARN_ON(!wdev->chandef.chan)) + if (WARN_ON(!wdev->links[0].ap.chandef.chan)) return; switch (event) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 740b29481bc6..af31978fc9cc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -792,6 +792,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NL80211_EHT_MIN_CAPABILITY_LEN, NL80211_EHT_MAX_CAPABILITY_LEN), [NL80211_ATTR_DISABLE_EHT] = { .type = NLA_FLAG }, + [NL80211_ATTR_MLO_LINKS] = + NLA_POLICY_NESTED_ARRAY(nl80211_policy), + [NL80211_ATTR_MLO_LINK_ID] = + NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS), }; /* policy for the key attributes */ @@ -1225,6 +1229,37 @@ static bool nl80211_put_txq_stats(struct sk_buff *msg, /* netlink command implementations */ +/** + * nl80211_link_id - return link ID + * @attrs: attributes to look at + * + * Returns: the link ID or 0 if not given + * + * Note this function doesn't do any validation of the link + * ID validity wrt. links that were actually added, so it must + * be called only from ops with %NL80211_FLAG_MLO_VALID_LINK_ID + * or if additional validation is done. + */ +static unsigned int nl80211_link_id(struct nlattr **attrs) +{ + struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID]; + + if (!linkid) + return 0; + + return nla_get_u8(linkid); +} + +static int nl80211_link_id_or_invalid(struct nlattr **attrs) +{ + struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID]; + + if (!linkid) + return -1; + + return nla_get_u8(linkid); +} + struct key_parse { struct key_params p; int idx; @@ -1496,11 +1531,15 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) case NL80211_IFTYPE_MESH_POINT: break; case NL80211_IFTYPE_ADHOC: + if (wdev->u.ibss.current_bss) + return 0; + return -ENOLINK; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!wdev->current_bss) - return -ENOLINK; - break; + /* for MLO, require driver validation of the link ID */ + if (wdev->connected) + return 0; + return -ENOLINK; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_MONITOR: @@ -3232,12 +3271,14 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct genl_info *info) + struct genl_info *info, + int _link_id) { struct cfg80211_chan_def chandef; int result; enum nl80211_iftype iftype = NL80211_IFTYPE_MONITOR; struct wireless_dev *wdev = NULL; + int link_id = _link_id; if (dev) wdev = dev->ieee80211_ptr; @@ -3246,6 +3287,12 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, if (wdev) iftype = wdev->iftype; + if (link_id < 0) { + if (wdev && wdev->valid_links) + return -EINVAL; + link_id = 0; + } + result = nl80211_parse_chandef(rdev, info, &chandef); if (result) return result; @@ -3254,49 +3301,48 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, - iftype)) { - result = -EINVAL; - break; - } - if (wdev->beacon_interval) { + iftype)) + return -EINVAL; + if (wdev->links[link_id].ap.beacon_interval) { + struct ieee80211_channel *cur_chan; 
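
For reference, the converted command handlers all follow the same shape as
the one above: fetch the pre-validated link ID first, then index the per-link
state. A hypothetical doit callback as a sketch (the handler name and the
-ENOTCONN policy are invented; nl80211_link_id() and wdev->links[] are from
this patch):

	static int nl80211_example_doit(struct sk_buff *skb,
					struct genl_info *info)
	{
		unsigned int link_id = nl80211_link_id(info->attrs);
		struct net_device *dev = info->user_ptr[1];
		struct wireless_dev *wdev = dev->ieee80211_ptr;

		/* Using link_id without further checks is only safe
		 * because the op is registered with
		 * NL80211_FLAG_MLO_VALID_LINK_ID (defined further below),
		 * so pre_doit already rejected a missing or invalid link
		 * ID on MLO interfaces and any link ID on non-MLO ones,
		 * making the fallback of 0 correct here.
		 */
		if (!wdev->links[link_id].ap.beacon_interval)
			return -ENOTCONN;

		/* ... operate on wdev->links[link_id] ... */
		return 0;
	}
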
+ if (!dev || !rdev->ops->set_ap_chanwidth || !(rdev->wiphy.features & - NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) { - result = -EBUSY; - break; - } + NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) + return -EBUSY; /* Only allow dynamic channel width changes */ - if (chandef.chan != wdev->preset_chandef.chan) { - result = -EBUSY; - break; - } - result = rdev_set_ap_chanwidth(rdev, dev, &chandef); + cur_chan = wdev->links[link_id].ap.chandef.chan; + if (chandef.chan != cur_chan) + return -EBUSY; + + result = rdev_set_ap_chanwidth(rdev, dev, link_id, + &chandef); if (result) - break; + return result; + wdev->links[link_id].ap.chandef = chandef; + } else { + wdev->u.ap.preset_chandef = chandef; } - wdev->preset_chandef = chandef; - result = 0; - break; + return 0; case NL80211_IFTYPE_MESH_POINT: - result = cfg80211_set_mesh_channel(rdev, wdev, &chandef); - break; + return cfg80211_set_mesh_channel(rdev, wdev, &chandef); case NL80211_IFTYPE_MONITOR: - result = cfg80211_set_monitor_channel(rdev, &chandef); - break; + return cfg80211_set_monitor_channel(rdev, &chandef); default: - result = -EINVAL; + break; } - return result; + return -EINVAL; } static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int link_id = nl80211_link_id_or_invalid(info->attrs); struct net_device *netdev = info->user_ptr[1]; - return __nl80211_set_channel(rdev, netdev, info); + return __nl80211_set_channel(rdev, netdev, info, link_id); } static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) @@ -3411,7 +3457,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) result = __nl80211_set_channel( rdev, nl80211_can_set_dev_channel(wdev) ? netdev : NULL, - info); + info, -1); if (result) goto out; } @@ -3696,15 +3742,13 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) goto nla_put_failure; - if (rdev->ops->get_channel) { - int ret; + if (rdev->ops->get_channel && !wdev->valid_links) { struct cfg80211_chan_def chandef = {}; + int ret; - ret = rdev_get_channel(rdev, wdev, &chandef); - if (ret == 0) { - if (nl80211_send_chandef(msg, &chandef)) - goto nla_put_failure; - } + ret = rdev_get_channel(rdev, wdev, 0, &chandef); + if (ret == 0 && nl80211_send_chandef(msg, &chandef)) + goto nla_put_failure; } if (rdev->ops->get_tx_power) { @@ -3721,27 +3765,24 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - if (wdev->ssid_len && - nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid)) + if (wdev->u.ap.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len, + wdev->u.ap.ssid)) goto nla_put_failure_locked; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - case NL80211_IFTYPE_ADHOC: { - const struct element *ssid_elem; - - if (!wdev->current_bss) - break; - rcu_read_lock(); - ssid_elem = ieee80211_bss_get_elem(&wdev->current_bss->pub, - WLAN_EID_SSID); - if (ssid_elem && - nla_put(msg, NL80211_ATTR_SSID, ssid_elem->datalen, - ssid_elem->data)) - goto nla_put_failure_rcu_locked; - rcu_read_unlock(); + if (wdev->u.client.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.client.ssid_len, + wdev->u.client.ssid)) + goto nla_put_failure_locked; + break; + case NL80211_IFTYPE_ADHOC: + if (wdev->u.ibss.ssid_len && + nla_put(msg, NL80211_ATTR_SSID, wdev->u.ibss.ssid_len, + 
wdev->u.ibss.ssid)) + goto nla_put_failure_locked; break; - } default: /* nothing */ break; @@ -3761,8 +3802,6 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag genlmsg_end(msg, hdr); return 0; - nla_put_failure_rcu_locked: - rcu_read_unlock(); nla_put_failure_locked: wdev_unlock(wdev); nla_put_failure: @@ -4014,10 +4053,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) wdev_lock(wdev); BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN); - wdev->mesh_id_up_len = + wdev->u.mesh.id_up_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); - memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]), - wdev->mesh_id_up_len); + memcpy(wdev->u.mesh.id, + nla_data(info->attrs[NL80211_ATTR_MESH_ID]), + wdev->u.mesh.id_up_len); wdev_unlock(wdev); } @@ -4122,10 +4162,11 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) wdev_lock(wdev); BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN); - wdev->mesh_id_up_len = + wdev->u.mesh.id_up_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); - memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]), - wdev->mesh_id_up_len); + memcpy(wdev->u.mesh.id, + nla_data(info->attrs[NL80211_ATTR_MESH_ID]), + wdev->u.mesh.id_up_len); wdev_unlock(wdev); break; case NL80211_IFTYPE_NAN: @@ -4662,7 +4703,7 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; - if (!dev->ieee80211_ptr->beacon_interval) + if (!dev->ieee80211_ptr->links[0].ap.beacon_interval) return -EINVAL; acl = parse_acl_data(&rdev->wiphy, info); @@ -4818,14 +4859,24 @@ static void he_build_mcs_mask(u16 he_mcs_map, } } -static u16 he_get_txmcsmap(struct genl_info *info, +static u16 he_get_txmcsmap(struct genl_info *info, unsigned int link_id, const struct ieee80211_sta_he_cap *he_cap) { struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - __le16 tx_mcs; + struct cfg80211_chan_def *chandef; + __le16 tx_mcs; - switch (wdev->chandef.width) { + chandef = wdev_chandef(wdev, link_id); + if (!chandef) { + /* + * This is probably broken, but we never maintained + * a chandef in these cases, so it always was. 
+ */ + return le16_to_cpu(he_cap->he_mcs_nss_supp.tx_mcs_80); + } + + switch (chandef->width) { case NL80211_CHAN_WIDTH_80P80: tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80; break; @@ -4836,6 +4887,7 @@ static u16 he_get_txmcsmap(struct genl_info *info, tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80; break; } + return le16_to_cpu(tx_mcs); } @@ -4843,7 +4895,8 @@ static bool he_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, struct ieee80211_supported_band *sband, struct nl80211_txrate_he *txrate, - u16 mcs[NL80211_HE_NSS_MAX]) + u16 mcs[NL80211_HE_NSS_MAX], + unsigned int link_id) { const struct ieee80211_sta_he_cap *he_cap; u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {}; @@ -4856,7 +4909,7 @@ static bool he_set_mcs_mask(struct genl_info *info, memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX); - tx_mcs_map = he_get_txmcsmap(info, he_cap); + tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap); /* Build he_mcs_mask from HE capabilities */ he_build_mcs_mask(tx_mcs_map, tx_mcs_mask); @@ -4876,7 +4929,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask, struct net_device *dev, - bool default_all_enabled) + bool default_all_enabled, + unsigned int link_id) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4913,7 +4967,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (!he_cap) continue; - he_tx_mcs_map = he_get_txmcsmap(info, he_cap); + he_tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap); he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs); mask->control[i].he_gi = 0xFF; @@ -4978,7 +5032,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (tb[NL80211_TXRATE_HE] && !he_set_mcs_mask(info, wdev, sband, nla_data(tb[NL80211_TXRATE_HE]), - mask->control[band].he_mcs)) + mask->control[band].he_mcs, + link_id)) return -EINVAL; if (tb[NL80211_TXRATE_HE_GI]) @@ -5215,6 +5270,8 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, memset(bcn, 0, sizeof(*bcn)); + bcn->link_id = nl80211_link_id(attrs); + if (attrs[NL80211_ATTR_BEACON_HEAD]) { bcn->head = nla_data(attrs[NL80211_ATTR_BEACON_HEAD]); bcn->head_len = nla_len(attrs[NL80211_ATTR_BEACON_HEAD]); @@ -5468,22 +5525,20 @@ static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, struct cfg80211_ap_settings *params) { struct wireless_dev *wdev; - bool ret = false; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO) continue; - if (!wdev->preset_chandef.chan) + if (!wdev->u.ap.preset_chandef.chan) continue; - params->chandef = wdev->preset_chandef; - ret = true; - break; + params->chandef = wdev->u.ap.preset_chandef; + return true; } - return ret; + return false; } static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, @@ -5541,6 +5596,7 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_ap_settings *params; @@ -5553,7 +5609,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->start_ap) return -EOPNOTSUPP; - if (wdev->beacon_interval) + if 
(wdev->links[link_id].ap.beacon_interval) return -EALREADY; /* these are required for START_AP */ @@ -5595,6 +5651,18 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = -EINVAL; goto out; } + + if (wdev->u.ap.ssid_len && + (wdev->u.ap.ssid_len != params->ssid_len || + memcmp(wdev->u.ap.ssid, params->ssid, params->ssid_len))) { + /* require identical SSID for MLO */ + err = -EINVAL; + goto out; + } + } else if (wdev->valid_links) { + /* require SSID for MLO */ + err = -EINVAL; + goto out; } if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) @@ -5662,8 +5730,12 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_chandef(rdev, info, ¶ms->chandef); if (err) goto out; - } else if (wdev->preset_chandef.chan) { - params->chandef = wdev->preset_chandef; + } else if (wdev->valid_links) { + /* with MLD need to specify the channel configuration */ + err = -EINVAL; + goto out; + } else if (wdev->u.ap.preset_chandef.chan) { + params->chandef = wdev->u.ap.preset_chandef; } else if (!nl80211_get_ap_channel(rdev, params)) { err = -EINVAL; goto out; @@ -5679,7 +5751,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, ¶ms->beacon_rate, - dev, false); + dev, false, link_id); if (err) goto out; @@ -5779,19 +5851,28 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params->flags |= NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT; wdev_lock(wdev); + if (wdev->conn_owner_nlportid && + info->attrs[NL80211_ATTR_SOCKET_OWNER] && + wdev->conn_owner_nlportid != info->snd_portid) { + err = -EINVAL; + goto out_unlock; + } + + /* FIXME: validate MLO/link-id against driver capabilities */ + err = rdev_start_ap(rdev, dev, params); if (!err) { - wdev->preset_chandef = params->chandef; - wdev->beacon_interval = params->beacon_interval; - wdev->chandef = params->chandef; - wdev->ssid_len = params->ssid_len; - memcpy(wdev->ssid, params->ssid, wdev->ssid_len); + wdev->links[link_id].ap.beacon_interval = params->beacon_interval; + wdev->links[link_id].ap.chandef = params->chandef; + wdev->u.ap.ssid_len = params->ssid_len; + memcpy(wdev->u.ap.ssid, params->ssid, + params->ssid_len); if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) wdev->conn_owner_nlportid = info->snd_portid; } +out_unlock: wdev_unlock(wdev); - out: kfree(params->acl); kfree(params->beacon.mbssid_ies); @@ -5807,6 +5888,7 @@ out: static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_beacon_data params; @@ -5819,7 +5901,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->change_beacon) return -EOPNOTSUPP; - if (!wdev->beacon_interval) + if (!wdev->links[link_id].ap.beacon_interval) return -EINVAL; err = nl80211_parse_beacon(rdev, info->attrs, ¶ms); @@ -5838,9 +5920,10 @@ out: static int nl80211_stop_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; - return cfg80211_stop_ap(rdev, dev, false); + return cfg80211_stop_ap(rdev, dev, link_id, false); } static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { @@ -7590,7 
+7673,7 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, wdev_lock(wdev); /* If not connected, get default parameters */ - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) memcpy(&cur_params, &default_mesh_config, sizeof(cur_params)); else err = rdev_get_mesh_config(rdev, dev, &cur_params); @@ -7971,7 +8054,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, return err; wdev_lock(wdev); - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) err = -ENOLINK; if (!err) @@ -8463,14 +8546,44 @@ int nl80211_parse_random_mac(struct nlattr **attrs, return 0; } -static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) +static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan) { + unsigned int link_id; + bool all_ok = true; + ASSERT_WDEV_LOCK(wdev); if (!cfg80211_beaconing_iface_active(wdev)) return true; - if (!(wdev->chandef.chan->flags & IEEE80211_CHAN_RADAR)) + /* + * FIXME: check if we have a free HW resource/link for chan + * + * This, as well as the FIXME below, requires knowing the link + * capabilities of the hardware. + */ + + /* we cannot leave radar channels */ + for_each_valid_link(wdev, link_id) { + struct cfg80211_chan_def *chandef; + + chandef = wdev_chandef(wdev, link_id); + if (!chandef) + continue; + + /* + * FIXME: don't require all_ok, but rather check only the + * correct HW resource/link onto which 'chan' falls, + * as only that link leaves the channel for doing + * the off-channel operation. + */ + + if (chandef->chan->flags & IEEE80211_CHAN_RADAR) + all_ok = false; + } + + if (all_ok) return true; return regulatory_pre_cac_allowed(wdev->wiphy); @@ -8553,7 +8666,7 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, int err; if (!(wiphy->features & randomness_flag) || - (wdev && wdev->current_bss)) + (wdev && wdev->connected)) return -EOPNOTSUPP; err = nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask); @@ -8690,17 +8803,14 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->n_channels = i; wdev_lock(wdev); - if (!cfg80211_off_channel_oper_allowed(wdev)) { - struct ieee80211_channel *chan; + for (i = 0; i < request->n_channels; i++) { + struct ieee80211_channel *chan = request->channels[i]; - if (request->n_channels != 1) { - wdev_unlock(wdev); - err = -EBUSY; - goto out_free; - } + /* if we can go off-channel to the target channel we're good */ + if (cfg80211_off_channel_oper_allowed(wdev, chan)) + continue; - chan = request->channels[0]; - if (chan->center_freq != wdev->chandef.chan->center_freq) { + if (!cfg80211_wdev_on_sub_chan(wdev, chan, true)) { wdev_unlock(wdev); err = -EBUSY; goto out_free; @@ -9445,7 +9555,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms); if (!err) { - wdev->chandef = chandef; + wdev->links[0].ap.chandef = chandef; wdev->cac_started = true; wdev->cac_start_time = jiffies; wdev->cac_time_ms = cac_time_ms; @@ -9513,6 +9623,7 @@ static int nl80211_notify_radar_detection(struct sk_buff *skb, static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_csa_settings params; @@ -9539,15 +9650,15 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) 
need_handle_dfs_flag = false; /* useless if AP is not running */ - if (!wdev->beacon_interval) + if (!wdev->links[link_id].ap.beacon_interval) return -ENOTCONN; break; case NL80211_IFTYPE_ADHOC: - if (!wdev->ssid_len) + if (!wdev->u.ibss.ssid_len) return -ENOTCONN; break; case NL80211_IFTYPE_MESH_POINT: - if (!wdev->mesh_id_len) + if (!wdev->u.mesh.id_len) return -ENOTCONN; break; default: @@ -9718,6 +9829,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, { struct cfg80211_bss *res = &intbss->pub; const struct cfg80211_bss_ies *ies; + unsigned int link_id; void *hdr; struct nlattr *bss; @@ -9822,13 +9934,15 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, switch (wdev->iftype) { case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: - if (intbss == wdev->current_bss && - nla_put_u32(msg, NL80211_BSS_STATUS, - NL80211_BSS_STATUS_ASSOCIATED)) - goto nla_put_failure; + for_each_valid_link(wdev, link_id) { + if (intbss == wdev->links[link_id].client.current_bss && + nla_put_u32(msg, NL80211_BSS_STATUS, + NL80211_BSS_STATUS_ASSOCIATED)) + goto nla_put_failure; + } break; case NL80211_IFTYPE_ADHOC: - if (intbss == wdev->current_bss && + if (intbss == wdev->u.ibss.current_bss && nla_put_u32(msg, NL80211_BSS_STATUS, NL80211_BSS_STATUS_IBSS_JOINED)) goto nla_put_failure; @@ -11362,7 +11476,7 @@ static int nl80211_update_connect_params(struct sk_buff *skb, } wdev_lock(dev->ieee80211_ptr); - if (!wdev->current_bss) + if (!wdev->connected) ret = -ENOLINK; else ret = rdev_update_connect_params(rdev, dev, &connect, changed); @@ -11575,9 +11689,9 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + unsigned int link_id = nl80211_link_id(info->attrs); struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_chan_def chandef; - const struct cfg80211_chan_def *compat_chandef; struct sk_buff *msg; void *hdr; u64 cookie; @@ -11607,10 +11721,22 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, return err; wdev_lock(wdev); - if (!cfg80211_off_channel_oper_allowed(wdev) && - !cfg80211_chandef_identical(&wdev->chandef, &chandef)) { - compat_chandef = cfg80211_chandef_compatible(&wdev->chandef, - &chandef); + if (!cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) { + const struct cfg80211_chan_def *oper_chandef, *compat_chandef; + + oper_chandef = wdev_chandef(wdev, link_id); + + if (WARN_ON(!oper_chandef)) { + /* cannot happen since we must beacon to get here */ + WARN_ON(1); + wdev_unlock(wdev); + return -EBUSY; + } + + /* note: returns first one if identical chandefs */ + compat_chandef = cfg80211_chandef_compatible(&chandef, + oper_chandef); + if (compat_chandef != &chandef) { wdev_unlock(wdev); return -EBUSY; @@ -11672,6 +11798,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_bitrate_mask mask; + unsigned int link_id = nl80211_link_id(info->attrs); struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -11683,11 +11810,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, wdev_lock(wdev); err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &mask, - dev, true); + dev, true, link_id); if (err) goto out; - err = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + err = rdev_set_bitrate_mask(rdev, dev, link_id, NULL, &mask); out: 
wdev_unlock(wdev); return err; @@ -11812,7 +11939,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) return -EINVAL; wdev_lock(wdev); - if (params.offchan && !cfg80211_off_channel_oper_allowed(wdev)) { + if (params.offchan && + !cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) { wdev_unlock(wdev); return -EBUSY; } @@ -12030,12 +12158,13 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, * connection is established and enough beacons received to calculate * the average. */ - if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss && + if (!wdev->cqm_config->last_rssi_event_value && + wdev->links[0].client.current_bss && rdev->ops->get_station) { struct station_info sinfo = {}; u8 *mac_addr; - mac_addr = wdev->current_bss->pub.bssid; + mac_addr = wdev->links[0].client.current_bss->pub.bssid; err = rdev_get_station(rdev, dev, mac_addr, &sinfo); if (err) @@ -12298,7 +12427,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &setup.beacon_rate, - dev, false); + dev, false, 0); if (err) return err; @@ -13268,7 +13397,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); - if (!wdev->current_bss) { + if (!wdev->connected) { err = -ENOTCONN; goto out; } @@ -14537,7 +14666,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) switch (wdev->iftype) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->current_bss) + if (wdev->connected) break; err = -ENOTCONN; goto out; @@ -14710,13 +14839,13 @@ static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info) return -EINVAL; wdev_lock(wdev); - if (!wdev->current_bss) { + if (!wdev->connected) { ret = -ENOTCONN; goto out; } pmk_conf.aa = nla_data(info->attrs[NL80211_ATTR_MAC]); - if (memcmp(pmk_conf.aa, wdev->current_bss->pub.bssid, ETH_ALEN)) { + if (memcmp(pmk_conf.aa, wdev->u.client.connected_addr, ETH_ALEN)) { ret = -EINVAL; goto out; } @@ -14844,9 +14973,13 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_MESH_POINT: break; case NL80211_IFTYPE_ADHOC: + if (wdev->u.ibss.current_bss) + break; + err = -ENOTCONN; + goto out; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->current_bss) + if (wdev->connected) break; err = -ENOTCONN; goto out; @@ -14882,12 +15015,14 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_ftm_responder_stats ftm_stats = {}; + unsigned int link_id = nl80211_link_id(info->attrs); struct sk_buff *msg; void *hdr; struct nlattr *ftm_stats_attr; int err; - if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval) + if (wdev->iftype != NL80211_IFTYPE_AP || + !wdev->links[link_id].ap.beacon_interval) return -EOPNOTSUPP; err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats); @@ -15017,7 +15152,8 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) static int parse_tid_conf(struct cfg80211_registered_device *rdev, struct nlattr *attrs[], struct net_device *dev, struct cfg80211_tid_cfg *tid_conf, - struct genl_info *info, const u8 *peer) + struct genl_info *info, const u8 *peer, + unsigned int link_id) { struct netlink_ext_ack *extack = info->extack; u64 mask; @@ 
-15092,7 +15228,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, &tid_conf->txrate_mask, dev, - true); + true, link_id); if (err) return err; @@ -15119,6 +15255,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1]; + unsigned int link_id = nl80211_link_id(info->attrs); struct net_device *dev = info->user_ptr[1]; struct cfg80211_tid_config *tid_config; struct nlattr *tid; @@ -15156,7 +15293,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, ret = parse_tid_conf(rdev, attrs, dev, &tid_config->tid_conf[conf_idx], - info, tid_config->peer); + info, tid_config->peer, link_id); if (ret) goto bad_tid_conf; @@ -15295,6 +15432,62 @@ static int nl80211_set_fils_aad(struct sk_buff *skb, return rdev_set_fils_aad(rdev, dev, &fils_aad); } +static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info) +{ + unsigned int link_id = nl80211_link_id(info->attrs); + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (!(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + break; + default: + return -EINVAL; + } + + if (!info->attrs[NL80211_ATTR_MAC] || + !is_valid_ether_addr(nla_data(info->attrs[NL80211_ATTR_MAC]))) + return -EINVAL; + + wdev_lock(wdev); + wdev->valid_links |= BIT(link_id); + ether_addr_copy(wdev->links[link_id].addr, + nla_data(info->attrs[NL80211_ATTR_MAC])); + wdev_unlock(wdev); + + return 0; +} + +static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info) +{ + unsigned int link_id = nl80211_link_id(info->attrs); + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* cannot remove if there's no link */ + if (!info->attrs[NL80211_ATTR_MLO_LINK_ID]) + return -EINVAL; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + break; + default: + return -EINVAL; + } + + /* FIXME: stop the link operations first */ + + wdev_lock(wdev); + wdev->valid_links &= ~BIT(link_id); + eth_zero_addr(wdev->links[link_id].addr); + wdev_unlock(wdev); + + return 0; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -15307,6 +15500,8 @@ static int nl80211_set_fils_aad(struct sk_buff *skb, NL80211_FLAG_CHECK_NETDEV_UP) #define NL80211_FLAG_CLEAR_SKB 0x20 #define NL80211_FLAG_NO_WIPHY_MTX 0x40 +#define NL80211_FLAG_MLO_VALID_LINK_ID 0x80 +#define NL80211_FLAG_MLO_UNSUPPORTED 0x100 #define INTERNAL_FLAG_SELECTORS(__sel) \ SELECTOR(__sel, NONE, 0) /* must be first */ \ @@ -15316,6 +15511,12 @@ static int nl80211_set_fils_aad(struct sk_buff *skb, NL80211_FLAG_NEED_WDEV) \ SELECTOR(__sel, NETDEV, \ NL80211_FLAG_NEED_NETDEV) \ + SELECTOR(__sel, NETDEV_LINK, \ + NL80211_FLAG_NEED_NETDEV | \ + NL80211_FLAG_MLO_VALID_LINK_ID) \ + SELECTOR(__sel, NETDEV_NO_MLO, \ + NL80211_FLAG_NEED_NETDEV | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ SELECTOR(__sel, WIPHY_RTNL, \ NL80211_FLAG_NEED_WIPHY | \ NL80211_FLAG_NEED_RTNL) \ @@ -15331,14 +15532,31 @@ static int nl80211_set_fils_aad(struct sk_buff *skb, NL80211_FLAG_NEED_RTNL) \ SELECTOR(__sel, NETDEV_UP, \ NL80211_FLAG_NEED_NETDEV_UP) \ + SELECTOR(__sel, NETDEV_UP_LINK, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_MLO_VALID_LINK_ID) \ + SELECTOR(__sel, NETDEV_UP_NO_MLO, \ + 
NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ + SELECTOR(__sel, NETDEV_UP_NO_MLO_CLEAR, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_CLEAR_SKB | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ SELECTOR(__sel, NETDEV_UP_NOTMX, \ NL80211_FLAG_NEED_NETDEV_UP | \ NL80211_FLAG_NO_WIPHY_MTX) \ + SELECTOR(__sel, NETDEV_UP_NOTMX_NOMLO, \ + NL80211_FLAG_NEED_NETDEV_UP | \ + NL80211_FLAG_NO_WIPHY_MTX | \ + NL80211_FLAG_MLO_UNSUPPORTED) \ SELECTOR(__sel, NETDEV_UP_CLEAR, \ NL80211_FLAG_NEED_NETDEV_UP | \ NL80211_FLAG_CLEAR_SKB) \ SELECTOR(__sel, WDEV_UP, \ NL80211_FLAG_NEED_WDEV_UP) \ + SELECTOR(__sel, WDEV_UP_LINK, \ + NL80211_FLAG_NEED_WDEV_UP | \ + NL80211_FLAG_MLO_VALID_LINK_ID) \ SELECTOR(__sel, WDEV_UP_RTNL, \ NL80211_FLAG_NEED_WDEV_UP | \ NL80211_FLAG_NEED_RTNL) \ @@ -15362,9 +15580,10 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = NULL; - struct wireless_dev *wdev; - struct net_device *dev; + struct wireless_dev *wdev = NULL; + struct net_device *dev = NULL; u32 internal_flags; + int err; if (WARN_ON(ops->internal_flags >= ARRAY_SIZE(nl80211_internal_flags))) return -EINVAL; @@ -15375,8 +15594,8 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, if (internal_flags & NL80211_FLAG_NEED_WIPHY) { rdev = cfg80211_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { - rtnl_unlock(); - return PTR_ERR(rdev); + err = PTR_ERR(rdev); + goto out_unlock; } info->user_ptr[0] = rdev; } else if (internal_flags & NL80211_FLAG_NEED_NETDEV || @@ -15384,17 +15603,18 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, wdev = __cfg80211_wdev_from_attrs(NULL, genl_info_net(info), info->attrs); if (IS_ERR(wdev)) { - rtnl_unlock(); - return PTR_ERR(wdev); + err = PTR_ERR(wdev); + goto out_unlock; } dev = wdev->netdev; + dev_hold(dev); rdev = wiphy_to_rdev(wdev->wiphy); if (internal_flags & NL80211_FLAG_NEED_NETDEV) { if (!dev) { - rtnl_unlock(); - return -EINVAL; + err = -EINVAL; + goto out_unlock; } info->user_ptr[1] = dev; @@ -15404,14 +15624,44 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, if (internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && !wdev_running(wdev)) { - rtnl_unlock(); - return -ENETDOWN; + err = -ENETDOWN; + goto out_unlock; } - dev_hold(dev); info->user_ptr[0] = rdev; } + if (internal_flags & NL80211_FLAG_MLO_VALID_LINK_ID) { + struct nlattr *link_id = info->attrs[NL80211_ATTR_MLO_LINK_ID]; + + if (!wdev) { + err = -EINVAL; + goto out_unlock; + } + + /* MLO -> require valid link ID */ + if (wdev->valid_links && + (!link_id || + !(wdev->valid_links & BIT(nla_get_u16(link_id))))) { + err = -EINVAL; + goto out_unlock; + } + + /* non-MLO -> no link ID attribute accepted */ + if (!wdev->valid_links && link_id) { + err = -EINVAL; + goto out_unlock; + } + } + + if (internal_flags & NL80211_FLAG_MLO_UNSUPPORTED) { + if (info->attrs[NL80211_ATTR_MLO_LINK_ID] || + (wdev && wdev->valid_links)) { + err = -EINVAL; + goto out_unlock; + } + } + if (rdev && !(internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) { wiphy_lock(&rdev->wiphy); /* we keep the mutex locked until post_doit */ @@ -15421,6 +15671,10 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, rtnl_unlock(); return 0; +out_unlock: + rtnl_unlock(); + dev_put(dev); + return err; } static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, @@ -15636,6 +15890,7 @@ static const struct genl_small_ops 
nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_key, .flags = GENL_UNS_ADMIN_PERM, + /* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on key */ .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), }, @@ -15659,21 +15914,24 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_set_beacon, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_START_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_STOP_AP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_GET_STATION, @@ -15939,7 +16197,9 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_remain_on_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP), + /* FIXME: requiring a link ID here is probably not good */ + .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, @@ -15953,7 +16213,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_tx_bitrate_mask, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_REGISTER_FRAME, @@ -16002,7 +16263,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_channel, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_JOIN_MESH, @@ -16163,7 +16425,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mac_acl, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_UNSUPPORTED), }, { .cmd = NL80211_CMD_RADAR_DETECT, @@ -16171,7 +16434,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .doit = nl80211_start_radar_detection, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NO_WIPHY_MTX), + NL80211_FLAG_NO_WIPHY_MTX | + NL80211_FLAG_MLO_UNSUPPORTED), }, { .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES, @@ -16217,7 +16481,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_channel_switch, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + 
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_VENDOR, @@ -16240,7 +16505,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_add_tx_ts, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_UNSUPPORTED), }, { .cmd = NL80211_CMD_DEL_TX_TS, @@ -16301,7 +16567,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_ftm_responder_stats, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_PEER_MEASUREMENT_START, @@ -16333,7 +16600,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_SET_TID_CONFIG, .doit = nl80211_set_tid_config, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_SET_SAR_SPECS, @@ -16357,6 +16625,19 @@ static const struct genl_small_ops nl80211_small_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, + { + .cmd = NL80211_CMD_ADD_LINK, + .doit = nl80211_add_link, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, + { + .cmd = NL80211_CMD_REMOVE_LINK, + .doit = nl80211_remove_link, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -17984,23 +18265,37 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, } void cfg80211_ch_switch_notify(struct net_device *dev, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + unsigned int link_id) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_WDEV_LOCK(wdev); + WARN_INVALID_LINK_ID(wdev, link_id); - trace_cfg80211_ch_switch_notify(dev, chandef); - - wdev->chandef = *chandef; - wdev->preset_chandef = *chandef; + trace_cfg80211_ch_switch_notify(dev, chandef, link_id); - if ((wdev->iftype == NL80211_IFTYPE_STATION || - wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && - !WARN_ON(!wdev->current_bss)) - cfg80211_update_assoc_bss_entry(wdev, chandef->chan); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (!WARN_ON(!wdev->links[link_id].client.current_bss)) + cfg80211_update_assoc_bss_entry(wdev, link_id, + chandef->chan); + break; + case NL80211_IFTYPE_MESH_POINT: + wdev->u.mesh.chandef = *chandef; + wdev->u.mesh.preset_chandef = *chandef; + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + wdev->links[link_id].ap.chandef = *chandef; + break; + default: + WARN_ON(1); + break; + } cfg80211_sched_dfs_chan_update(rdev); diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c index 2d26a6d980bf..27a1732264f9 100644 --- a/net/wireless/ocb.c +++ b/net/wireless/ocb.c @@ -4,6 +4,7 @@ * * Copyright: (c) 2014 Czech Technical University in Prague * (c) 2014 Volkswagen Group Research + * Copyright (C) 2022 Intel Corporation * Author: 
Rostislav Lisovy * Funded by: Volkswagen Group Research */ @@ -34,7 +35,7 @@ int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev, err = rdev_join_ocb(rdev, dev, setup); if (!err) - wdev->chandef = setup->chandef; + wdev->u.ocb.chandef = setup->chandef; return err; } @@ -69,7 +70,7 @@ int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, err = rdev_leave_ocb(rdev, dev); if (!err) - memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + memset(&wdev->u.ocb.chandef, 0, sizeof(wdev->u.ocb.chandef)); return err; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 439bcf52369c..d2300eff03ae 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions of this file + * Copyright(c) 2016-2017 Intel Deutschland GmbH + * Copyright (C) 2018, 2021-2022 Intel Corporation + */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -172,11 +177,11 @@ static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, - struct net_device *dev) + struct net_device *dev, unsigned int link_id) { int ret; - trace_rdev_stop_ap(&rdev->wiphy, dev); - ret = rdev->ops->stop_ap(&rdev->wiphy, dev); + trace_rdev_stop_ap(&rdev->wiphy, dev, link_id); + ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -651,12 +656,14 @@ static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *peer, + struct net_device *dev, unsigned int link_id, + const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; - trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, peer, mask); - ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, peer, mask); + trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); + ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id, + peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -944,12 +951,13 @@ static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, + unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; - trace_rdev_get_channel(&rdev->wiphy, wdev); - ret = rdev->ops->get_channel(&rdev->wiphy, wdev, chandef); + trace_rdev_get_channel(&rdev->wiphy, wdev, link_id); + ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; @@ -1107,12 +1115,14 @@ static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, - struct net_device *dev, struct cfg80211_chan_def *chandef) + struct net_device *dev, + unsigned int link_id, + struct cfg80211_chan_def *chandef) { int ret; - trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, chandef); - ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, chandef); + trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); + ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 58e83ce642ad..c7383ede794f 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. 
Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2370,6 +2370,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; bool ret; + int link; wdev_lock(wdev); iftype = wdev->iftype; @@ -2378,62 +2379,83 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) if (!wdev->netdev || !netif_running(wdev->netdev)) goto wdev_inactive_unlock; - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - case NL80211_IFTYPE_MESH_POINT: - if (!wdev->beacon_interval) - goto wdev_inactive_unlock; - chandef = wdev->chandef; - break; - case NL80211_IFTYPE_ADHOC: - if (!wdev->ssid_len) - goto wdev_inactive_unlock; - chandef = wdev->chandef; - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - if (!wdev->current_bss || - !wdev->current_bss->pub.channel) - goto wdev_inactive_unlock; - - if (!rdev->ops->get_channel || - rdev_get_channel(rdev, wdev, &chandef)) - cfg80211_chandef_create(&chandef, - wdev->current_bss->pub.channel, - NL80211_CHAN_NO_HT); - break; - case NL80211_IFTYPE_MONITOR: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_DEVICE: - /* no enforcement required */ - break; - default: - /* others not implemented for now */ - WARN_ON(1); - break; - } + for (link = 0; link < ARRAY_SIZE(wdev->links); link++) { + struct ieee80211_channel *chan; - wdev_unlock(wdev); + if (!wdev->valid_links && link > 0) + break; + if (!(wdev->valid_links & BIT(link))) + continue; + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + if (!wdev->u.mesh.beacon_interval) + continue; + chandef = wdev->u.mesh.chandef; + break; + case NL80211_IFTYPE_ADHOC: + if (!wdev->u.ibss.ssid_len) + continue; + chandef = wdev->u.ibss.chandef; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + /* Maybe we could consider disabling that link only? 
*/ + if (!wdev->links[link].client.current_bss) + continue; - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_MESH_POINT: - wiphy_lock(wiphy); - ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype); - wiphy_unlock(wiphy); + chan = wdev->links[link].client.current_bss->pub.channel; + if (!chan) + continue; - return ret; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - return cfg80211_chandef_usable(wiphy, &chandef, - IEEE80211_CHAN_DISABLED); - default: - break; + if (!rdev->ops->get_channel || + rdev_get_channel(rdev, wdev, link, &chandef)) + cfg80211_chandef_create(&chandef, chan, + NL80211_CHAN_NO_HT); + break; + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_DEVICE: + /* no enforcement required */ + break; + default: + /* others not implemented for now */ + WARN_ON(1); + break; + } + + wdev_unlock(wdev); + + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_MESH_POINT: + wiphy_lock(wiphy); + ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef, + iftype); + wiphy_unlock(wiphy); + + if (!ret) + return ret; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + ret = cfg80211_chandef_usable(wiphy, &chandef, + IEEE80211_CHAN_DISABLED); + if (!ret) + return ret; + break; + default: + break; + } + + wdev_lock(wdev); } + wdev_unlock(wdev); + return true; wdev_inactive_unlock: @@ -4215,8 +4237,17 @@ static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) * In both cases we should end the CAC on the wdev. */ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { - if (wdev->cac_started && - !cfg80211_chandef_dfs_usable(&rdev->wiphy, &wdev->chandef)) + struct cfg80211_chan_def *chandef; + + if (!wdev->cac_started) + continue; + + /* FIXME: radar detection is tied to link 0 for now */ + chandef = wdev_chandef(wdev, 0); + if (!chandef) + continue; + + if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef)) rdev_end_cac(rdev, wdev->netdev); } } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 6d82bd9eaf8c..0134e5d5c81a 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include #include @@ -2617,7 +2617,8 @@ void cfg80211_bss_iter(struct wiphy *wiphy, spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { - if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel)) + if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel, + false)) iter(wiphy, &bss->pub, iter_data); } @@ -2626,11 +2627,12 @@ void cfg80211_bss_iter(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_bss_iter); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, + unsigned int link_id, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - struct cfg80211_internal_bss *cbss = wdev->current_bss; + struct cfg80211_internal_bss *cbss = wdev->links[link_id].client.current_bss; struct cfg80211_internal_bss *new = NULL; struct cfg80211_internal_bss *bss; struct cfg80211_bss *nontrans_bss; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ff4d48fcbfb2..35602201057b 100644 --- a/net/wireless/sme.c 
+++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020, 2022 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ @@ -454,6 +454,20 @@ void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev) schedule_work(&rdev->conn_work); } +static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev) +{ + unsigned int link; + + for_each_valid_link(wdev, link) { + if (!wdev->links[link].client.current_bss) + continue; + cfg80211_unhold_bss(wdev->links[link].client.current_bss); + cfg80211_put_bss(wdev->wiphy, + &wdev->links[link].client.current_bss->pub); + wdev->links[link].client.current_bss = NULL; + } +} + static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) @@ -521,12 +535,11 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; + cfg80211_wdev_release_bsses(wdev); + if (wdev->connected) { cfg80211_sme_free(wdev); + wdev->connected = false; } if (wdev->conn) @@ -563,8 +576,8 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, wdev->conn->auto_auth = false; } - wdev->conn->params.ssid = wdev->ssid; - wdev->conn->params.ssid_len = wdev->ssid_len; + wdev->conn->params.ssid = wdev->u.client.ssid; + wdev->conn->params.ssid_len = wdev->u.client.ssid_len; /* see if we have the bss already */ bss = cfg80211_get_conn_bss(wdev); @@ -648,7 +661,7 @@ static bool cfg80211_is_all_idle(void) list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); - if (wdev->conn || wdev->current_bss || + if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); @@ -668,7 +681,6 @@ static void disconnect_work(struct work_struct *work) DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); - /* * API calls for drivers implementing connect/disconnect and * SME event handling @@ -729,23 +741,19 @@ void __cfg80211_connect_result(struct net_device *dev, if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) { WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid, - wdev->ssid, wdev->ssid_len, + wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (cr->bss) cfg80211_hold_bss(bss_from_pub(cr->bss)); } - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; - } + cfg80211_wdev_release_bsses(wdev); if (cr->status != WLAN_STATUS_SUCCESS) { kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; - wdev->ssid_len = 0; + wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; if (cr->bss) { cfg80211_unhold_bss(bss_from_pub(cr->bss)); @@ -758,7 +766,9 @@ void __cfg80211_connect_result(struct net_device *dev, if (WARN_ON(!cr->bss)) return; - wdev->current_bss = bss_from_pub(cr->bss); + wdev->links[0].client.current_bss = bss_from_pub(cr->bss); + wdev->connected = true; + ether_addr_copy(wdev->u.client.connected_addr, cr->bss->bssid); if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) 
cfg80211_upload_connect_keys(wdev); @@ -801,7 +811,7 @@ void cfg80211_connect_done(struct net_device *dev, found = cfg80211_get_bss(wdev->wiphy, NULL, params->bss->bssid, - wdev->ssid, wdev->ssid_len, + wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (found) { @@ -906,18 +916,17 @@ void __cfg80211_roamed(struct wireless_dev *wdev, wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; - if (WARN_ON(!wdev->current_bss)) + if (WARN_ON(!wdev->connected)) goto out; - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - wdev->current_bss = NULL; + cfg80211_wdev_release_bsses(wdev); if (WARN_ON(!info->bss)) return; cfg80211_hold_bss(bss_from_pub(info->bss)); - wdev->current_bss = bss_from_pub(info->bss); + wdev->links[0].client.current_bss = bss_from_pub(info->bss); + ether_addr_copy(wdev->u.client.connected_addr, info->bss->bssid); wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), @@ -963,8 +972,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, if (!info->bss) { info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, - info->bssid, wdev->ssid, - wdev->ssid_len, + info->bssid, wdev->u.client.ssid, + wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); } @@ -1034,8 +1043,8 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return; - if (WARN_ON(!wdev->current_bss) || - WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + if (WARN_ON(!wdev->connected) || + WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, @@ -1087,13 +1096,9 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; - if (wdev->current_bss) { - cfg80211_unhold_bss(wdev->current_bss); - cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); - } - - wdev->current_bss = NULL; - wdev->ssid_len = 0; + cfg80211_wdev_release_bsses(wdev); + wdev->connected = false; + wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; @@ -1182,19 +1187,20 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, * already connected, so reject a new SSID unless it's the * same (which is the case for re-association.) */ - if (wdev->ssid_len && - (wdev->ssid_len != connect->ssid_len || - memcmp(wdev->ssid, connect->ssid, wdev->ssid_len))) + if (wdev->u.client.ssid_len && + (wdev->u.client.ssid_len != connect->ssid_len || + memcmp(wdev->u.client.ssid, connect->ssid, wdev->u.client.ssid_len))) return -EALREADY; /* * If connected, reject (re-)association unless prev_bssid * matches the current BSSID. */ - if (wdev->current_bss) { + if (wdev->connected) { if (!prev_bssid) return -EALREADY; - if (!ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid)) + if (!ether_addr_equal(prev_bssid, + wdev->u.client.connected_addr)) return -ENOTCONN; } @@ -1245,8 +1251,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, } wdev->connect_keys = connkeys; - memcpy(wdev->ssid, connect->ssid, connect->ssid_len); - wdev->ssid_len = connect->ssid_len; + memcpy(wdev->u.client.ssid, connect->ssid, connect->ssid_len); + wdev->u.client.ssid_len = connect->ssid_len; wdev->conn_bss_type = connect->pbss ? 
IEEE80211_BSS_TYPE_PBSS : IEEE80211_BSS_TYPE_ESS; @@ -1262,8 +1268,8 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, * This could be reassoc getting refused, don't clear * ssid_len in that case. */ - if (!wdev->current_bss) - wdev->ssid_len = 0; + if (!wdev->connected) + wdev->u.client.ssid_len = 0; return err; } @@ -1287,7 +1293,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); - else if (wdev->ssid_len) + else if (wdev->u.client.ssid_len) err = rdev_disconnect(rdev, dev, reason); /* @@ -1295,8 +1301,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, * in which case cfg80211_disconnected() will take care of * this later. */ - if (!wdev->current_bss) - wdev->ssid_len = 0; + if (!wdev->connected) + wdev->u.client.ssid_len = 0; return err; } @@ -1320,7 +1326,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - __cfg80211_stop_ap(rdev, wdev->netdev, false); + __cfg80211_stop_ap(rdev, wdev->netdev, -1, false); break; case NL80211_IFTYPE_MESH_POINT: __cfg80211_leave_mesh(rdev, wdev->netdev); @@ -1332,7 +1338,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) * ops->disconnect not implemented. Otherwise we can * use cfg80211_disconnect. */ - if (rdev->ops->disconnect || wdev->current_bss) + if (rdev->ops->disconnect || wdev->connected) cfg80211_disconnect(rdev, wdev->netdev, WLAN_REASON_DEAUTH_LEAVING, true); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 228079d7690a..3b2c956b8d78 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -569,6 +569,7 @@ TRACE_EVENT(rdev_start_ap, __field(bool, privacy) __field(enum nl80211_auth_type, auth_type) __field(int, inactivity_timeout) + __field(unsigned int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ -583,16 +584,17 @@ TRACE_EVENT(rdev_start_ap, __entry->inactivity_timeout = settings->inactivity_timeout; memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); memcpy(__entry->ssid, settings->ssid, settings->ssid_len); + __entry->link_id = settings->beacon.link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, " CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, " "hidden ssid: %d, wpa versions: %u, privacy: %s, " - "auth type: %d, inactivity timeout: %d", + "auth type: %d, inactivity timeout: %d, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG, __entry->beacon_interval, __entry->dtim_period, __entry->hidden_ssid, __entry->wpa_ver, BOOL_TO_STR(__entry->privacy), __entry->auth_type, - __entry->inactivity_timeout) + __entry->inactivity_timeout, __entry->link_id) ); TRACE_EVENT(rdev_change_beacon, @@ -602,6 +604,7 @@ TRACE_EVENT(rdev_change_beacon, TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __dynamic_array(u8, head, info ? info->head_len : 0) __dynamic_array(u8, tail, info ? info->tail_len : 0) __dynamic_array(u8, beacon_ies, info ? 
info->beacon_ies_len : 0) @@ -615,6 +618,7 @@ TRACE_EVENT(rdev_change_beacon, WIPHY_ASSIGN; NETDEV_ASSIGN; if (info) { + __entry->link_id = info->link_id; if (info->head) memcpy(__get_dynamic_array(head), info->head, info->head_len); @@ -635,9 +639,30 @@ TRACE_EVENT(rdev_change_beacon, if (info->probe_resp) memcpy(__get_dynamic_array(probe_resp), info->probe_resp, info->probe_resp_len); + } else { + __entry->link_id = -1; } ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id) +); + +TRACE_EVENT(rdev_stop_ap, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + unsigned int link_id), + TP_ARGS(wiphy, netdev, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id) ); DECLARE_EVENT_CLASS(wiphy_netdev_evt, @@ -654,11 +679,6 @@ DECLARE_EVENT_CLASS(wiphy_netdev_evt, TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) ); -DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), - TP_ARGS(wiphy, netdev) -); - DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) @@ -1619,20 +1639,24 @@ TRACE_EVENT(rdev_testmode_dump, TRACE_EVENT(rdev_set_bitrate_mask, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask), - TP_ARGS(wiphy, netdev, peer, mask), + TP_ARGS(wiphy, netdev, link_id, peer, mask), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(unsigned int, link_id) MAC_ENTRY(peer) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; MAC_ASSIGN(peer, peer); ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, peer: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + MAC_PR_ARG(peer)) ); TRACE_EVENT(rdev_update_mgmt_frame_registrations, @@ -2040,9 +2064,22 @@ TRACE_EVENT(rdev_set_noack_map, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map) ); -DEFINE_EVENT(wiphy_wdev_evt, rdev_get_channel, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev) +TRACE_EVENT(rdev_get_channel, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); TRACE_EVENT(rdev_return_chandef, @@ -2296,20 +2333,24 @@ TRACE_EVENT(rdev_set_qos_map, TRACE_EVENT(rdev_set_ap_chanwidth, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + unsigned int link_id, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, netdev, chandef), + TP_ARGS(wiphy, netdev, link_id, chandef), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY CHAN_DEF_ENTRY + __field(unsigned int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); + __entry->link_id = link_id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " 
CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, + __entry->link_id) ); TRACE_EVENT(rdev_add_tx_ts, @@ -3022,18 +3063,21 @@ TRACE_EVENT(cfg80211_chandef_dfs_required, TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, - struct cfg80211_chan_def *chandef), - TP_ARGS(netdev, chandef), + struct cfg80211_chan_def *chandef, + unsigned int link_id), + TP_ARGS(netdev, chandef, link_id), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY + __field(unsigned int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); + __entry->link_id = link_id; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, - NETDEV_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_ch_switch_started_notify, diff --git a/net/wireless/util.c b/net/wireless/util.c index a60d7d638e72..b7257862e0fe 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation */ #include #include @@ -1041,7 +1041,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, return -EBUSY; dev->ieee80211_ptr->use_4addr = false; - dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); rdev_set_qos_map(rdev, dev, NULL); wdev_unlock(dev->ieee80211_ptr); @@ -1049,7 +1048,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev, true); + cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); @@ -1073,6 +1072,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); + + memset(&dev->ieee80211_ptr->u, 0, + sizeof(dev->ieee80211_ptr->u)); + memset(&dev->ieee80211_ptr->links, 0, + sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); @@ -1930,6 +1934,24 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); +static int cfg80211_wdev_bi(struct wireless_dev *wdev) +{ + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + WARN_ON(wdev->valid_links); + return wdev->links[0].ap.beacon_interval; + case NL80211_IFTYPE_MESH_POINT: + return wdev->u.mesh.beacon_interval; + case NL80211_IFTYPE_ADHOC: + return wdev->u.ibss.beacon_interval; + default: + break; + } + + return 0; +} + static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different) @@ -1940,19 +1962,27 @@ static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, *beacon_int_different = false; list_for_each_entry(wdev, &wiphy->wdev_list, list) { - if (!wdev->beacon_interval) + int wdev_bi; + + /* this feature isn't supported with MLO */ + if (wdev->valid_links) + continue; + + wdev_bi = cfg80211_wdev_bi(wdev); + + if (!wdev_bi) continue; if (!*beacon_int_gcd) { - *beacon_int_gcd = wdev->beacon_interval; + *beacon_int_gcd = wdev_bi; continue; } - if 
(wdev->beacon_interval == *beacon_int_gcd) + if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; - *beacon_int_gcd = gcd(*beacon_int_gcd, wdev->beacon_interval); + *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a32065d600a1..a9767bfe7330 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -7,7 +7,7 @@ * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2022 Intel Corporation */ #include @@ -415,6 +415,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, int err, i; bool rejoin = false; + if (wdev->valid_links) + return -EINVAL; + if (pairwise && !addr) return -EINVAL; @@ -437,7 +440,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, return -EOPNOTSUPP; if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) { - if (!wdev->current_bss) + if (!wdev->connected) return -ENOLINK; if (!rdev->ops->set_default_mgmt_key) @@ -450,7 +453,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (remove) { err = 0; - if (wdev->current_bss) { + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) { /* * If removing the current TX key, we will need to * join a new IBSS without the privacy bit clear. @@ -501,7 +506,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, return -EINVAL; err = 0; - if (wdev->current_bss) + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) err = rdev_add_key(rdev, dev, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) @@ -526,7 +533,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104) && (tx_key || (!addr && wdev->wext.default_key == -1))) { - if (wdev->current_bss) { + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) { /* * If we are getting a new TX key from not having * had one before we need to join a new IBSS with @@ -549,7 +558,9 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC && (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) { - if (wdev->current_bss) + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) err = rdev_set_default_mgmt_key(rdev, dev, idx); if (!err) wdev->wext.default_mgmt_key = idx; @@ -595,6 +606,11 @@ static int cfg80211_wext_siwencode(struct net_device *dev, return -EOPNOTSUPP; wiphy_lock(&rdev->wiphy); + if (wdev->valid_links) { + err = -EOPNOTSUPP; + goto out; + } + idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; @@ -613,7 +629,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev, /* No key data - just set the default TX key index */ err = 0; wdev_lock(wdev); - if (wdev->current_bss) + if (wdev->connected || + (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->u.ibss.current_bss)) err = rdev_set_default_key(rdev, dev, idx, true, true); if (!err) @@ -865,7 +883,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, break; } - ret = 
rdev_get_channel(rdev, wdev, &chandef); + ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) break; freq->m = chandef.chan->center_freq; @@ -1270,7 +1288,10 @@ static int cfg80211_wext_siwrate(struct net_device *dev, return -EINVAL; wiphy_lock(&rdev->wiphy); - ret = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + if (dev->ieee80211_ptr->valid_links) + ret = -EOPNOTSUPP; + else + ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); wiphy_unlock(&rdev->wiphy); return ret; @@ -1294,8 +1315,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev, err = 0; wdev_lock(wdev); - if (wdev->current_bss) - memcpy(addr, wdev->current_bss->pub.bssid, ETH_ALEN); + if (!wdev->valid_links && wdev->links[0].client.current_bss) + memcpy(addr, wdev->links[0].client.current_bss->pub.bssid, + ETH_ALEN); else err = -EOPNOTSUPP; wdev_unlock(wdev); @@ -1339,11 +1361,11 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) /* Grab BSSID of current BSS, if any */ wdev_lock(wdev); - if (!wdev->current_bss) { + if (wdev->valid_links || !wdev->links[0].client.current_bss) { wdev_unlock(wdev); return NULL; } - memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); + memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); wdev_unlock(wdev); memset(&sinfo, 0, sizeof(sinfo)); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index cd09a9042261..68f45afc352d 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -3,7 +3,7 @@ * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg - * Copyright (C) 2009, 2020-2021 Intel Corporation. + * Copyright (C) 2009, 2020-2022 Intel Corporation */ #include @@ -124,9 +124,12 @@ int cfg80211_mgd_wext_giwfreq(struct net_device *dev, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; + if (wdev->valid_links) + return -EOPNOTSUPP; + wdev_lock(wdev); - if (wdev->current_bss) - chan = wdev->current_bss->pub.channel; + if (wdev->links[0].client.current_bss) + chan = wdev->links[0].client.current_bss->pub.channel; else if (wdev->wext.connect.channel) chan = wdev->wext.connect.channel; wdev_unlock(wdev); @@ -208,15 +211,19 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; + if (wdev->valid_links) + return -EINVAL; + data->flags = 0; wdev_lock(wdev); - if (wdev->current_bss) { + if (wdev->links[0].client.current_bss) { const struct element *ssid_elem; rcu_read_lock(); - ssid_elem = ieee80211_bss_get_elem(&wdev->current_bss->pub, - WLAN_EID_SSID); + ssid_elem = ieee80211_bss_get_elem( + &wdev->links[0].client.current_bss->pub, + WLAN_EID_SSID); if (ssid_elem) { data->flags = 1; data->length = ssid_elem->datalen; @@ -300,8 +307,14 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev, ap_addr->sa_family = ARPHRD_ETHER; wdev_lock(wdev); - if (wdev->current_bss) - memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); + if (wdev->valid_links) { + wdev_unlock(wdev); + return -EOPNOTSUPP; + } + if (wdev->links[0].client.current_bss) + memcpy(ap_addr->sa_data, + wdev->links[0].client.current_bss->pub.bssid, + ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); wdev_unlock(wdev); -- cgit v1.2.3 From d648c23024bd01333acd2fd5e34bcde0ffb66b16 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 31 May 2022 19:48:33 +0200 Subject: wifi: nl80211: support MLO in auth/assoc For authentication, we need the BSS, the link_id and the AP MLD address to create the link and station. For now, the driver assigns a
link address and sends the frame; the MLD address needs to be the address of the interface. For association, pass the list of BSSes that were selected for the MLO connection, along with extra per-STA profile elements, the AP MLD address and the link ID on which the association request should be sent. Note that for now we don't have a proper way to pass the link address(es), so the driver/mac80211 will select one. Depending on how that selection works, association without the auth data still being around (a mac80211 implementation detail) won't necessarily work, so this will need to be extended in the future to sort out the link addressing. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 31 +++++++++++++++ include/uapi/linux/nl80211.h | 3 ++ net/wireless/mlme.c | 30 +++++++++++++-- net/wireless/nl80211.c | 91 ++++++++++++++++++++++++++++++++++++++++++-- net/wireless/sme.c | 2 + 5 files changed, 151 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 772e099fc932..a4f9e6094118 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2718,6 +2718,12 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) * Authentication algorithm number, i.e., starting at the Authentication * transaction sequence number field. * @auth_data_len: Length of auth_data buffer in octets + * @link_id: if >= 0, indicates authentication should be done as an MLD, + * the interface address is included as the MLD address and the + * necessary link (with the given link_id) will be created (and + * given an MLD address) by the driver + * @ap_mld_addr: AP MLD address in case of authentication request with + * an AP MLD, valid iff @link_id >= 0 */ struct cfg80211_auth_request { struct cfg80211_bss *bss; @@ -2728,6 +2734,21 @@ struct cfg80211_auth_request { u8 key_len, key_idx; const u8 *auth_data; size_t auth_data_len; + s8 link_id; + const u8 *ap_mld_addr; +}; + +/** + * struct cfg80211_assoc_link - per-link information for MLO association + * @bss: the BSS pointer, see also &struct cfg80211_assoc_request::bss; + * if this is %NULL for a link, that link is not requested + * @elems: extra elements for the per-STA profile for this link + * @elems_len: length of the elements + */ +struct cfg80211_assoc_link { + struct cfg80211_bss *bss; + const u8 *elems; + size_t elems_len; }; /** @@ -2761,6 +2782,8 @@ enum cfg80211_assoc_req_flags { * given a reference that it must give back to cfg80211_send_rx_assoc() * or to cfg80211_assoc_timeout(). To ensure proper refcounting, new * association requests while already associating must be rejected. + * This also applies to the @links.bss parameter, which is used instead + * of this one (it is %NULL) for MLO associations. * @ie: Extra IEs to add to (Re)Association Request frame or %NULL * @ie_len: Length of ie buffer in octets * @use_mfp: Use management frame protection (IEEE 802.11w) in this association @@ -2785,6 +2808,11 @@ enum cfg80211_assoc_req_flags { * with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
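 *
 *	(Illustrative note, not part of the original patch: with the MLO
 *	fields added below (@links, @link_id, @ap_mld_addr), a two-link
 *	association request would be filled roughly as
 *
 *		req->ap_mld_addr = ap_mld_addr;
 *		req->link_id = 0;
 *		req->links[0].bss = bss_link0;
 *		req->links[1].bss = bss_link1;
 *
 *	where ap_mld_addr and the per-link BSS pointers are assumed to
 *	come from scan results, and @link_id selects the link on which
 *	the Association Request frame is sent.)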
* @s1g_capa: S1G capability override * @s1g_capa_mask: S1G capability override mask + * @links: per-link information for MLO connections + * @link_id: >= 0 for MLO connections, where links are given, and indicates + * the link on which the association request should be sent + * @ap_mld_addr: AP MLD address in case of MLO association request, + * valid iff @link_id >= 0 */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2800,6 +2828,9 @@ struct cfg80211_assoc_request { size_t fils_kek_len; const u8 *fils_nonces; struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask; + struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS]; + const u8 *ap_mld_addr; + s8 link_id; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a9a2c9fef295..60ad9a9f153d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2687,6 +2687,8 @@ enum nl80211_commands { * various commands that need a link ID to operate. * @NL80211_ATTR_MLO_LINKS: A nested array of links, each containing some * per-link information and a link ID. + * @NL80211_ATTR_MLD_ADDR: An MLD address, used with various commands such as + * authenticate/associate. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3204,6 +3206,7 @@ enum nl80211_attrs { NL80211_ATTR_MLO_LINKS, NL80211_ATTR_MLO_LINK_ID, + NL80211_ATTR_MLD_ADDR, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 532113937469..d92eed0e52cd 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -241,6 +241,10 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, if (!req->bss) return -ENOENT; + if (req->link_id >= 0 && + !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) { if (!req->key || !req->key_len || req->key_idx < 0 || req->key_idx > 3) @@ -294,10 +298,19 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct cfg80211_assoc_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; + int err, i, j; ASSERT_WDEV_LOCK(wdev); + for (i = 1; i < ARRAY_SIZE(req->links); i++) { + if (!req->links[i].bss) + continue; + for (j = 0; j < i; j++) { + if (req->links[i].bss == req->links[j].bss) + return -EINVAL; + } + } + if (wdev->connected && (!req->prev_bssid || !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) @@ -310,8 +323,19 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, err = rdev_assoc(rdev, dev, req); if (!err) { - cfg80211_ref_bss(&rdev->wiphy, req->bss); - cfg80211_hold_bss(bss_from_pub(req->bss)); + int link_id; + + if (req->bss) { + cfg80211_ref_bss(&rdev->wiphy, req->bss); + cfg80211_hold_bss(bss_from_pub(req->bss)); + } + + for (link_id = 0; link_id < ARRAY_SIZE(req->links); link_id++) { + if (!req->links[link_id].bss) + continue; + cfg80211_ref_bss(&rdev->wiphy, req->links[link_id].bss); + cfg80211_hold_bss(bss_from_pub(req->links[link_id].bss)); + } } return err; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5a4d3ddcdf80..9bc66a21ac3a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -796,6 +796,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_NESTED_ARRAY(nl80211_policy), [NL80211_ATTR_MLO_LINK_ID] = NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS), + [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), }; /* policy 
for the key attributes */ @@ -10282,6 +10283,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) req.key = key.p.key; req.key_len = key.p.key_len; req.key_idx = key.idx; + req.link_id = nl80211_link_id_or_invalid(info->attrs); + if (req.link_id >= 0) { + if (!info->attrs[NL80211_ATTR_MLD_ADDR]) + return -EINVAL; + req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + } req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, IEEE80211_BSS_TYPE_ESS, @@ -10475,7 +10482,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct cfg80211_assoc_request req = {}; + struct nlattr **attrs = NULL; const u8 *bssid, *ssid; + unsigned int link_id; int err, ssid_len; if (dev->ieee80211_ptr->conn_owner_nlportid && @@ -10585,9 +10594,81 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.s1g_capa)); } - req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs, &bssid); - if (IS_ERR(req.bss)) - return PTR_ERR(req.bss); + req.link_id = nl80211_link_id_or_invalid(info->attrs); + + if (info->attrs[NL80211_ATTR_MLO_LINKS]) { + unsigned int attrsize = NUM_NL80211_ATTR * sizeof(*attrs); + struct nlattr *link; + int rem = 0; + + if (req.link_id < 0) + return -EINVAL; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MAC] || + info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_MLD_ADDR]) + return -EINVAL; + + req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + + attrs = kzalloc(attrsize, GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + nla_for_each_nested(link, + info->attrs[NL80211_ATTR_MLO_LINKS], + rem) { + memset(attrs, 0, attrsize); + + nla_parse_nested(attrs, NL80211_ATTR_MAX, + link, NULL, NULL); + + if (!attrs[NL80211_ATTR_MLO_LINK_ID]) { + err = -EINVAL; + goto free; + } + + link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]); + /* cannot use the same link ID again */ + if (req.links[link_id].bss) { + err = -EINVAL; + goto free; + } + req.links[link_id].bss = + nl80211_assoc_bss(rdev, ssid, ssid_len, attrs, + &bssid); + if (IS_ERR(req.links[link_id].bss)) { + err = PTR_ERR(req.links[link_id].bss); + goto free; + } + + if (attrs[NL80211_ATTR_IE]) { + req.links[link_id].elems = + nla_data(attrs[NL80211_ATTR_IE]); + req.links[link_id].elems_len = + nla_len(attrs[NL80211_ATTR_IE]); + } + } + + if (!req.links[req.link_id].bss) { + err = -EINVAL; + goto free; + } + + kfree(attrs); + attrs = NULL; + } else { + if (req.link_id >= 0) + return -EINVAL; + + req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs, + &bssid); + if (IS_ERR(req.bss)) + return PTR_ERR(req.bss); + } err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { @@ -10605,7 +10686,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) wdev_unlock(dev->ieee80211_ptr); } +free: + for (link_id = 0; link_id < ARRAY_SIZE(req.links); link_id++) + cfg80211_put_bss(&rdev->wiphy, req.links[link_id].bss); cfg80211_put_bss(&rdev->wiphy, req.bss); + kfree(attrs); return err; } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index c8a99b90723b..b3c6ce4f85ee 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -177,6 +177,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); + auth_req.link_id = -1; 
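	/*
	 * Illustrative sketch, not part of the original patch: this SME
	 * path is always non-MLO, hence link_id = -1 above.  A
	 * hypothetical caller doing MLD authentication would instead
	 * fill the two new struct cfg80211_auth_request fields, e.g.:
	 *
	 *	auth_req.link_id = 0;
	 *	auth_req.ap_mld_addr = ap_mld_addr;
	 *
	 * where ap_mld_addr is an assumed u8[ETH_ALEN] taken from scan
	 * results; the driver then creates the link and assigns its
	 * address as described in the commit message above.
	 */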
err = cfg80211_mlme_auth(rdev, wdev->netdev, &auth_req); cfg80211_put_bss(&rdev->wiphy, auth_req.bss); return err; @@ -198,6 +199,7 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev, req.ht_capa_mask = params->ht_capa_mask; req.vht_capa = params->vht_capa; req.vht_capa_mask = params->vht_capa_mask; + req.link_id = -1; req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, -- cgit v1.2.3 From efbabc11650040c64884ff3019b88c7bcc0ceb1d Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 8 Jun 2022 14:46:37 +0530 Subject: cfg80211: Indicate MLO connection info in connect and roam callbacks The MLO links used for a connection with an MLD AP are decided by the driver when the SME is offloaded to the driver. Add support for drivers to indicate the information about the links used for an MLO connection in the connect and roam callbacks, and update the connected links information in the wdev from the connect/roam result sent by the driver. Also, send the connected links information to userspace. Add a netlink flag attribute to indicate that userspace supports handling of MLO connections. Drivers must not do an MLO connection when this flag is not set; this maintains backwards compatibility with older supplicant versions that do not support MLO connections. Signed-off-by: Veerendranath Jakkam Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 2 +- drivers/net/wireless/ath/wil6210/wmi.c | 4 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 6 +- drivers/net/wireless/rndis_wlan.c | 5 +- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 4 +- drivers/staging/wlan-ng/cfg80211.c | 2 +- include/net/cfg80211.h | 84 +++-- include/uapi/linux/nl80211.h | 6 + net/wireless/mlme.c | 4 +- net/wireless/nl80211.c | 120 ++++++- net/wireless/sme.c | 391 +++++++++++++++------ 11 files changed, 480 insertions(+), 148 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 33ed54738d47..e11c7e9accc0 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -807,7 +807,7 @@ void ath6kl_cfg80211_connect_event(struct ath6kl_vif *vif, u16 channel, cfg80211_put_bss(ar->wiphy, bss); } else if (vif->sme_state == SME_CONNECTED) { struct cfg80211_roam_info roam_info = { - .bss = bss, + .links[0].bss = bss, .req_ie = assoc_req_ie, .req_ie_len = assoc_req_len, .resp_ie = assoc_resp_ie, diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 98b4c189eecc..ea7bd403e706 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -1822,8 +1822,8 @@ wmi_evt_reassoc_status(struct wil6210_vif *vif, int id, void *d, int len) freq = ieee80211_channel_to_frequency(ch, NL80211_BAND_60GHZ); memset(&info, 0, sizeof(info)); - info.channel = ieee80211_get_channel(wiphy, freq); - info.bss = vif->bss; + info.links[0].channel = ieee80211_get_channel(wiphy, freq); + info.links[0].bss = vif->bss; info.req_ie = assoc_req_ie; info.req_ie_len = assoc_req_ie_len; info.resp_ie = assoc_resp_ie; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 11e1f07f83e0..3ae6779fe153 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -6017,8 +6017,8 @@ brcmf_bss_roaming_done(struct brcmf_cfg80211_info *cfg, done:
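	/*
	 * Illustrative sketch, not part of the original patch: brcmfmac
	 * is a single-link driver, so only links[0] is filled below.  A
	 * hypothetical MLO-capable driver roaming to a two-link AP MLD
	 * would instead report something like
	 *
	 *	roam_info.valid_links = BIT(0) | BIT(1);
	 *	roam_info.ap_mld_addr = ap_mld_addr;
	 *	roam_info.links[0].bss = bss_link0;
	 *	roam_info.links[1].bss = bss_link1;
	 *
	 * before calling cfg80211_roamed(), with ap_mld_addr and the
	 * per-link BSS entries assumed to come from the device's own
	 * roam event.
	 */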
kfree(buf); - roam_info.channel = notify_channel; - roam_info.bssid = profile->bssid; + roam_info.links[0].channel = notify_channel; + roam_info.links[0].bssid = profile->bssid; roam_info.req_ie = conn_info->req_ie; roam_info.req_ie_len = conn_info->req_ie_len; roam_info.resp_ie = conn_info->resp_ie; @@ -6061,7 +6061,7 @@ brcmf_bss_connect_done(struct brcmf_cfg80211_info *cfg, } else { conn_params.status = WLAN_STATUS_AUTH_TIMEOUT; } - conn_params.bssid = profile->bssid; + conn_params.links[0].bssid = profile->bssid; conn_params.req_ie = conn_info->req_ie; conn_params.req_ie_len = conn_info->req_ie_len; conn_params.resp_ie = conn_info->resp_ie; diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index ff2448394a1e..05524291d60c 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -2813,8 +2813,9 @@ static void rndis_wlan_do_link_up_work(struct usbnet *usbdev) resp_ie_len, 0, GFP_KERNEL); } else { struct cfg80211_roam_info roam_info = { - .channel = get_current_channel(usbdev, NULL), - .bssid = bssid, + .links[0].channel = + get_current_channel(usbdev, NULL), + .links[0].bssid = bssid, .req_ie = req_ie, .req_ie_len = req_ie_len, .resp_ie = resp_ie, diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 349aa3c4b668..cf35125b7891 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -450,8 +450,8 @@ check_bss: notify_channel = ieee80211_get_channel(wiphy, freq); - roam_info.channel = notify_channel; - roam_info.bssid = cur_network->network.mac_address; + roam_info.links[0].channel = notify_channel; + roam_info.links[0].bssid = cur_network->network.mac_address; roam_info.req_ie = pmlmepriv->assoc_req+sizeof(struct ieee80211_hdr_3addr)+2; roam_info.req_ie_len = diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index 87379edce9a8..b7b56d8406d1 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -645,7 +645,7 @@ void prism2_disconnected(struct wlandevice *wlandev) void prism2_roamed(struct wlandevice *wlandev) { struct cfg80211_roam_info roam_info = { - .bssid = wlandev->bssid, + .links[0].bssid = wlandev->bssid, }; cfg80211_roamed(wlandev->netdev, &roam_info, GFP_KERNEL); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5706f96b819a..996782c44838 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2763,6 +2763,9 @@ struct cfg80211_assoc_link { * request (connect callback). * @ASSOC_REQ_DISABLE_HE: Disable HE * @ASSOC_REQ_DISABLE_EHT: Disable EHT + * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links. + * Drivers shall disable MLO features for the current association if this + * flag is not set. */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -2771,6 +2774,7 @@ enum cfg80211_assoc_req_flags { CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), ASSOC_REQ_DISABLE_HE = BIT(4), ASSOC_REQ_DISABLE_EHT = BIT(5), + CONNECT_REQ_MLO_SUPPORT = BIT(6), }; /** @@ -5780,12 +5784,13 @@ static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev, !(wdev->valid_links & BIT(link_id))); } -#define for_each_valid_link(wdev, link_id) \ - for (link_id = 0; \ - link_id < ((wdev)->valid_links ? 
ARRAY_SIZE((wdev)->links) : 1); \ - link_id++) \ - if (!(wdev)->valid_links || \ - ((wdev)->valid_links & BIT(link_id))) +#define for_each_valid_link(link_info, link_id) \ + for (link_id = 0; \ + link_id < ((link_info)->valid_links ? \ + ARRAY_SIZE((link_info)->links) : 1); \ + link_id++) \ + if (!(link_info)->valid_links || \ + ((link_info)->valid_links & BIT(link_id))) /** * DOC: Utility functions @@ -7296,13 +7301,6 @@ struct cfg80211_fils_resp_params { * indicate that this is a failure, but without a status code. * @timeout_reason is used to report the reason for the timeout in that * case. - * @bssid: The BSSID of the AP (may be %NULL) - * @bss: Entry of bss to which STA got connected to, can be obtained through - * cfg80211_get_bss() (may be %NULL). But it is recommended to store the - * bss from the connect_request and hold a reference to it and return - * through this param to avoid a warning if the bss is expired during the - * connection, esp. for those drivers implementing connect op. - * Only one parameter among @bssid and @bss needs to be specified. * @req_ie: Association request IEs (may be %NULL) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) @@ -7314,17 +7312,41 @@ struct cfg80211_fils_resp_params { * not known. This value is used only if @status < 0 to indicate that the * failure is due to a timeout and not due to explicit rejection by the AP. * This value is ignored in other cases (@status >= 0). + * @valid_links: For MLO connection, BIT mask of the valid link ids. Otherwise + * zero. + * @ap_mld_addr: For MLO connection, MLD address of the AP. Otherwise %NULL. + * @links : For MLO connection, contains link info for the valid links indicated + * using @valid_links. For non-MLO connection, links[0] contains the + * connected AP info. + * @links.addr: For MLO connection, MAC address of the STA link. Otherwise + * %NULL. + * @links.bssid: For MLO connection, MAC address of the AP link. For non-MLO + * connection, links[0].bssid points to the BSSID of the AP (may be %NULL). + * @links.bss: For MLO connection, entry of bss to which STA link is connected. + * For non-MLO connection, links[0].bss points to entry of bss to which STA + * is connected. It can be obtained through cfg80211_get_bss() (may be + * %NULL). It is recommended to store the bss from the connect_request and + * hold a reference to it and return through this param to avoid a warning + * if the bss is expired during the connection, esp. for those drivers + * implementing connect op. Only one parameter among @bssid and @bss needs + * to be specified. 
*/ struct cfg80211_connect_resp_params { int status; - const u8 *bssid; - struct cfg80211_bss *bss; const u8 *req_ie; size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; struct cfg80211_fils_resp_params fils; enum nl80211_timeout_reason timeout_reason; + + const u8 *ap_mld_addr; + u16 valid_links; + struct { + const u8 *addr; + const u8 *bssid; + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -7394,8 +7416,8 @@ cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, memset(¶ms, 0, sizeof(params)); params.status = status; - params.bssid = bssid; - params.bss = bss; + params.links[0].bssid = bssid; + params.links[0].bss = bss; params.req_ie = req_ie; params.req_ie_len = req_ie_len; params.resp_ie = resp_ie; @@ -7466,24 +7488,40 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, /** * struct cfg80211_roam_info - driver initiated roaming information * - * @channel: the channel of the new AP - * @bss: entry of bss to which STA got roamed (may be %NULL if %bssid is set) - * @bssid: the BSSID of the new AP (may be %NULL if %bss is set) * @req_ie: association request IEs (maybe be %NULL) * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length * @fils: FILS related roaming information. + * @valid_links: For MLO roaming, BIT mask of the new valid links is set. + * Otherwise zero. + * @ap_mld_addr: For MLO roaming, MLD address of the new AP. Otherwise %NULL. + * @links : For MLO roaming, contains new link info for the valid links set in + * @valid_links. For non-MLO roaming, links[0] contains the new AP info. + * @links.addr: For MLO roaming, MAC address of the STA link. Otherwise %NULL. + * @links.bssid: For MLO roaming, MAC address of the new AP link. For non-MLO + * roaming, links[0].bssid points to the BSSID of the new AP. May be + * %NULL if %links.bss is set. + * @links.channel: the channel of the new AP. + * @links.bss: For MLO roaming, entry of new bss to which STA link got + * roamed. For non-MLO roaming, links[0].bss points to entry of bss to + * which STA got roamed (may be %NULL if %links.bssid is set) */ struct cfg80211_roam_info { - struct ieee80211_channel *channel; - struct cfg80211_bss *bss; - const u8 *bssid; const u8 *req_ie; size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; struct cfg80211_fils_resp_params fils; + + const u8 *ap_mld_addr; + u16 valid_links; + struct { + const u8 *addr; + const u8 *bssid; + struct ieee80211_channel *channel; + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 60ad9a9f153d..89f64f46b98d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2690,6 +2690,10 @@ enum nl80211_commands { * @NL80211_ATTR_MLD_ADDR: An MLD address, used with various commands such as * authenticate/associate. * + * @NL80211_ATTR_MLO_SUPPORT: Flag attribute to indicate user space supports MLO + * connection. Used with %NL80211_CMD_CONNECT. If this attribute is not + * included in NL80211_CMD_CONNECT drivers must not perform MLO connection. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3208,6 +3212,8 @@ enum nl80211_attrs { NL80211_ATTR_MLO_LINK_ID, NL80211_ATTR_MLD_ADDR, + NL80211_ATTR_MLO_SUPPORT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index d92eed0e52cd..8a84cf77667c 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -42,8 +42,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, memset(&cr, 0, sizeof(cr)); cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code); - cr.bssid = mgmt->bssid; - cr.bss = bss; + cr.links[0].bssid = mgmt->bssid; + cr.links[0].bss = bss; cr.req_ie = req_ies; cr.req_ie_len = req_ies_len; cr.resp_ie = resp_ie; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index aca799b9971e..6a45801c783c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -797,6 +797,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_LINK_ID] = NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS), [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), + [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -11529,6 +11530,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT; } + if (nla_get_flag(info->attrs[NL80211_ATTR_MLO_SUPPORT])) + connect.flags |= CONNECT_REQ_MLO_SUPPORT; + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -17304,10 +17308,29 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, { struct sk_buff *msg; void *hdr; + unsigned int link; + size_t link_info_size = 0; + const u8 *connected_addr = cr->valid_links ? + cr->ap_mld_addr : cr->links[0].bssid; + + if (cr->valid_links) { + for_each_valid_link(cr, link) { + /* Nested attribute header */ + link_info_size += NLA_HDRLEN; + /* Link ID */ + link_info_size += nla_total_size(sizeof(u8)); + link_info_size += cr->links[link].addr ? + nla_total_size(ETH_ALEN) : 0; + link_info_size += (cr->links[link].bssid || + cr->links[link].bss) ? + nla_total_size(ETH_ALEN) : 0; + } + } msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + cr->fils.kek_len + cr->fils.pmk_len + - (cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); + (cr->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size, + gfp); if (!msg) return; @@ -17319,8 +17342,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - (cr->bssid && - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cr->bssid)) || + (connected_addr && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr)) || nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, cr->status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : cr->status) || @@ -17346,6 +17369,38 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid))))) goto nla_put_failure; + if (cr->valid_links) { + int i = 1; + struct nlattr *nested; + + nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); + if (!nested) + goto nla_put_failure; + + for_each_valid_link(cr, link) { + struct nlattr *nested_mlo_links; + const u8 *bssid = cr->links[link].bss ? 
+ cr->links[link].bss->bssid : + cr->links[link].bssid; + + nested_mlo_links = nla_nest_start(msg, i); + if (!nested_mlo_links) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) || + (bssid && + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) || + (cr->links[link].addr && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + cr->links[link].addr))) + goto nla_put_failure; + + nla_nest_end(msg, nested_mlo_links); + i++; + } + nla_nest_end(msg, nested); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -17362,11 +17417,32 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, { struct sk_buff *msg; void *hdr; - const u8 *bssid = info->bss ? info->bss->bssid : info->bssid; + size_t link_info_size = 0; + unsigned int link; + const u8 *connected_addr = info->ap_mld_addr ? + info->ap_mld_addr : + (info->links[0].bss ? + info->links[0].bss->bssid : + info->links[0].bssid); + + if (info->valid_links) { + for_each_valid_link(info, link) { + /* Nested attribute header */ + link_info_size += NLA_HDRLEN; + /* Link ID */ + link_info_size += nla_total_size(sizeof(u8)); + link_info_size += info->links[link].addr ? + nla_total_size(ETH_ALEN) : 0; + link_info_size += (info->links[link].bssid || + info->links[link].bss) ? + nla_total_size(ETH_ALEN) : 0; + } + } msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + - (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + + link_info_size, gfp); if (!msg) return; @@ -17378,7 +17454,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr) || (info->req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, info->req_ie_len, info->req_ie)) || @@ -17397,6 +17473,38 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid))) goto nla_put_failure; + if (info->valid_links) { + int i = 1; + struct nlattr *nested; + + nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); + if (!nested) + goto nla_put_failure; + + for_each_valid_link(info, link) { + struct nlattr *nested_mlo_links; + const u8 *bssid = info->links[link].bss ? 
+ info->links[link].bss->bssid : + info->links[link].bssid; + + nested_mlo_links = nla_nest_start(msg, i); + if (!nested_mlo_links) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) || + (bssid && + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) || + (info->links[link].addr && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + info->links[link].addr))) + goto nla_put_failure; + + nla_nest_end(msg, nested_mlo_links); + i++; + } + nla_nest_end(msg, nested); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index b3c6ce4f85ee..00be498aab2e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -275,7 +275,7 @@ void cfg80211_conn_work(struct work_struct *work) memset(&cr, 0, sizeof(cr)); cr.status = -1; - cr.bssid = bssid; + cr.links[0].bssid = bssid; cr.timeout_reason = treason; __cfg80211_connect_result(wdev->netdev, &cr, false); } @@ -384,7 +384,7 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) memset(&cr, 0, sizeof(cr)); cr.status = status_code; - cr.bssid = mgmt->bssid; + cr.links[0].bssid = mgmt->bssid; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { @@ -698,6 +698,20 @@ static void disconnect_work(struct work_struct *work) DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); +static void +cfg80211_connect_result_release_bsses(struct wireless_dev *wdev, + struct cfg80211_connect_resp_params *cr) +{ + unsigned int link; + + for_each_valid_link(cr, link) { + if (!cr->links[link].bss) + continue; + cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); + cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); + } +} + /* * API calls for drivers implementing connect/disconnect and * SME event handling @@ -715,21 +729,33 @@ void __cfg80211_connect_result(struct net_device *dev, #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif + unsigned int link; + const u8 *connected_addr; + bool bss_not_found = false; ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && - wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) { - cfg80211_put_bss(wdev->wiphy, cr->bss); - return; + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) + goto out; + + if (cr->valid_links) { + if (WARN_ON(!cr->ap_mld_addr)) + goto out; + + for_each_valid_link(cr, link) { + if (WARN_ON(!cr->links[link].addr)) + goto out; + } } wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); + connected_addr = cr->valid_links ? 
cr->ap_mld_addr : cr->links[0].bssid; #ifdef CONFIG_CFG80211_WEXT - if (wextev) { + if (wextev && !cr->valid_links) { if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->req_ie_len; @@ -746,23 +772,38 @@ void __cfg80211_connect_result(struct net_device *dev, memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; - if (cr->bssid && cr->status == WLAN_STATUS_SUCCESS) { - memcpy(wrqu.ap_addr.sa_data, cr->bssid, ETH_ALEN); - memcpy(wdev->wext.prev_bssid, cr->bssid, ETH_ALEN); + if (connected_addr && cr->status == WLAN_STATUS_SUCCESS) { + memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif - if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) { - WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); - cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid, - wdev->u.client.ssid, wdev->u.client.ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); - if (cr->bss) - cfg80211_hold_bss(bss_from_pub(cr->bss)); + if (cr->status == WLAN_STATUS_SUCCESS) { + for_each_valid_link(cr, link) { + if (WARN_ON_ONCE(!cr->links[link].bss)) + break; + } + + for_each_valid_link(cr, link) { + if (cr->links[link].bss) + continue; + + cr->links[link].bss = + cfg80211_get_bss(wdev->wiphy, NULL, + cr->links[link].bssid, + wdev->u.client.ssid, + wdev->u.client.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (!cr->links[link].bss) { + bss_not_found = true; + break; + } + cfg80211_hold_bss(bss_from_pub(cr->links[link].bss)); + } } cfg80211_wdev_release_bsses(wdev); @@ -772,26 +813,40 @@ void __cfg80211_connect_result(struct net_device *dev, wdev->connect_keys = NULL; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; - if (cr->bss) { - cfg80211_unhold_bss(bss_from_pub(cr->bss)); - cfg80211_put_bss(wdev->wiphy, cr->bss); - } + cfg80211_connect_result_release_bsses(wdev, cr); cfg80211_sme_free(wdev); return; } - if (WARN_ON(!cr->bss)) + if (WARN_ON(bss_not_found)) { + cfg80211_connect_result_release_bsses(wdev, cr); return; + } - wdev->links[0].client.current_bss = bss_from_pub(cr->bss); + memset(wdev->links, 0, sizeof(wdev->links)); + wdev->valid_links = cr->valid_links; + for_each_valid_link(cr, link) + wdev->links[link].client.current_bss = + bss_from_pub(cr->links[link].bss); wdev->connected = true; - ether_addr_copy(wdev->u.client.connected_addr, cr->bss->bssid); + ether_addr_copy(wdev->u.client.connected_addr, connected_addr); + if (cr->valid_links) { + for_each_valid_link(cr, link) + memcpy(wdev->links[link].addr, cr->links[link].addr, + ETH_ALEN); + } if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) cfg80211_upload_connect_keys(wdev); rcu_read_lock(); - country_elem = ieee80211_bss_get_elem(cr->bss, WLAN_EID_COUNTRY); + for_each_valid_link(cr, link) { + country_elem = + ieee80211_bss_get_elem(cr->links[link].bss, + WLAN_EID_COUNTRY); + if (country_elem) + break; + } if (!country_elem) { rcu_read_unlock(); return; @@ -804,12 +859,60 @@ void __cfg80211_connect_result(struct net_device *dev, if (!country_data) return; - regulatory_hint_country_ie(wdev->wiphy, cr->bss->channel->band, + regulatory_hint_country_ie(wdev->wiphy, + cr->links[link].bss->channel->band, country_data, country_datalen); kfree(country_data); + + return; +out: + for_each_valid_link(cr, link) + cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } -/* Consumes bss object one way 
or another */ +static void cfg80211_update_link_bss(struct wireless_dev *wdev, + struct cfg80211_bss **bss) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_internal_bss *ibss; + + if (!*bss) + return; + + ibss = bss_from_pub(*bss); + if (list_empty(&ibss->list)) { + struct cfg80211_bss *found = NULL, *tmp = *bss; + + found = cfg80211_get_bss(wdev->wiphy, NULL, + (*bss)->bssid, + wdev->u.client.ssid, + wdev->u.client.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (found) { + /* The same BSS is already updated so use it + * instead, as it has latest info. + */ + *bss = found; + } else { + /* Update with BSS provided by driver, it will + * be freshly added and ref cnted, we can free + * the old one. + * + * signal_valid can be false, as we are not + * expecting the BSS to be found. + * + * keep the old timestamp to avoid confusion + */ + cfg80211_bss_update(rdev, ibss, false, + ibss->ts); + } + + cfg80211_put_bss(wdev->wiphy, tmp); + } +} + +/* Consumes bss object(s) one way or another */ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_connect_resp_params *params, gfp_t gfp) @@ -819,55 +922,34 @@ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_event *ev; unsigned long flags; u8 *next; + size_t link_info_size = 0; + unsigned int link; - if (params->bss) { - struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); - - if (list_empty(&ibss->list)) { - struct cfg80211_bss *found = NULL, *tmp = params->bss; - - found = cfg80211_get_bss(wdev->wiphy, NULL, - params->bss->bssid, - wdev->u.client.ssid, wdev->u.client.ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); - if (found) { - /* The same BSS is already updated so use it - * instead, as it has latest info. - */ - params->bss = found; - } else { - /* Update with BSS provided by driver, it will - * be freshly added and ref cnted, we can free - * the old one. - * - * signal_valid can be false, as we are not - * expecting the BSS to be found. - * - * keep the old timestamp to avoid confusion - */ - cfg80211_bss_update(rdev, ibss, false, - ibss->ts); - } - - cfg80211_put_bss(wdev->wiphy, tmp); - } + for_each_valid_link(params, link) { + cfg80211_update_link_bss(wdev, ¶ms->links[link].bss); + link_info_size += params->links[link].bssid ? ETH_ALEN : 0; + link_info_size += params->links[link].addr ? ETH_ALEN : 0; } - ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + + ev = kzalloc(sizeof(*ev) + (params->ap_mld_addr ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + params->fils.kek_len + params->fils.pmk_len + - (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); + (params->fils.pmkid ? 
WLAN_PMKID_LEN : 0) + link_info_size, + gfp); + if (!ev) { - cfg80211_put_bss(wdev->wiphy, params->bss); + for_each_valid_link(params, link) + cfg80211_put_bss(wdev->wiphy, + params->links[link].bss); return; } ev->type = EVENT_CONNECT_RESULT; next = ((u8 *)ev) + sizeof(*ev); - if (params->bssid) { - ev->cr.bssid = next; - memcpy((void *)ev->cr.bssid, params->bssid, ETH_ALEN); + if (params->ap_mld_addr) { + ev->cr.ap_mld_addr = next; + memcpy((void *)ev->cr.ap_mld_addr, params->ap_mld_addr, + ETH_ALEN); next += ETH_ALEN; } if (params->req_ie_len) { @@ -907,9 +989,28 @@ void cfg80211_connect_done(struct net_device *dev, ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; if (params->fils.update_erp_next_seq_num) ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; - if (params->bss) - cfg80211_hold_bss(bss_from_pub(params->bss)); - ev->cr.bss = params->bss; + ev->cr.valid_links = params->valid_links; + for_each_valid_link(params, link) { + if (params->links[link].bss) + cfg80211_hold_bss( + bss_from_pub(params->links[link].bss)); + ev->cr.links[link].bss = params->links[link].bss; + + if (params->links[link].addr) { + ev->cr.links[link].addr = next; + memcpy((void *)ev->cr.links[link].addr, + params->links[link].addr, + ETH_ALEN); + next += ETH_ALEN; + } + if (params->links[link].bssid) { + ev->cr.links[link].bssid = next; + memcpy((void *)ev->cr.links[link].bssid, + params->links[link].bssid, + ETH_ALEN); + next += ETH_ALEN; + } + } ev->cr.status = params->status; ev->cr.timeout_reason = params->timeout_reason; @@ -927,6 +1028,9 @@ void __cfg80211_roamed(struct wireless_dev *wdev, #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif + unsigned int link; + const u8 *connected_addr; + ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && @@ -936,48 +1040,76 @@ void __cfg80211_roamed(struct wireless_dev *wdev, if (WARN_ON(!wdev->connected)) goto out; + if (info->valid_links) { + if (WARN_ON(!info->ap_mld_addr)) + goto out; + + for_each_valid_link(info, link) { + if (WARN_ON(!info->links[link].addr)) + goto out; + } + } + cfg80211_wdev_release_bsses(wdev); - if (WARN_ON(!info->bss)) - return; + for_each_valid_link(info, link) { + if (WARN_ON(!info->links[link].bss)) + goto out; + } - cfg80211_hold_bss(bss_from_pub(info->bss)); - wdev->links[0].client.current_bss = bss_from_pub(info->bss); - ether_addr_copy(wdev->u.client.connected_addr, info->bss->bssid); + memset(wdev->links, 0, sizeof(wdev->links)); + wdev->valid_links = info->valid_links; + for_each_valid_link(info, link) { + cfg80211_hold_bss(bss_from_pub(info->links[link].bss)); + wdev->links[link].client.current_bss = + bss_from_pub(info->links[link].bss); + } + connected_addr = info->valid_links ? 
+ info->ap_mld_addr : + info->links[0].bss->bssid; + ether_addr_copy(wdev->u.client.connected_addr, connected_addr); + if (info->valid_links) { + for_each_valid_link(info, link) + memcpy(wdev->links[link].addr, info->links[link].addr, + ETH_ALEN); + } wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT - if (info->req_ie) { - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = info->req_ie_len; - wireless_send_event(wdev->netdev, IWEVASSOCREQIE, - &wrqu, info->req_ie); - } + if (!info->valid_links) { + if (info->req_ie) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = info->req_ie_len; + wireless_send_event(wdev->netdev, IWEVASSOCREQIE, + &wrqu, info->req_ie); + } + + if (info->resp_ie) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = info->resp_ie_len; + wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, + &wrqu, info->resp_ie); + } - if (info->resp_ie) { memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = info->resp_ie_len; - wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, - &wrqu, info->resp_ie); + wrqu.ap_addr.sa_family = ARPHRD_ETHER; + memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); + wdev->wext.prev_bssid_valid = true; + wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); } - - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.ap_addr.sa_family = ARPHRD_ETHER; - memcpy(wrqu.ap_addr.sa_data, info->bss->bssid, ETH_ALEN); - memcpy(wdev->wext.prev_bssid, info->bss->bssid, ETH_ALEN); - wdev->wext.prev_bssid_valid = true; - wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); #endif return; out: - cfg80211_put_bss(wdev->wiphy, info->bss); + for_each_valid_link(info, link) + cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } -/* Consumes info->bss object one way or another */ +/* Consumes info->links.bss object(s) one way or another */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp) { @@ -986,25 +1118,41 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, struct cfg80211_event *ev; unsigned long flags; u8 *next; + unsigned int link; + size_t link_info_size = 0; + bool bss_not_found = false; + + for_each_valid_link(info, link) { + link_info_size += info->links[link].addr ? ETH_ALEN : 0; + link_info_size += info->links[link].bssid ? ETH_ALEN : 0; - if (!info->bss) { - info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, - info->bssid, wdev->u.client.ssid, - wdev->u.client.ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); + if (info->links[link].bss) + continue; + + info->links[link].bss = + cfg80211_get_bss(wdev->wiphy, + info->links[link].channel, + info->links[link].bssid, + wdev->u.client.ssid, + wdev->u.client.ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + + if (!info->links[link].bss) { + bss_not_found = true; + break; + } } - if (WARN_ON(!info->bss)) - return; + if (WARN_ON(bss_not_found)) + goto out; ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + - (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); - if (!ev) { - cfg80211_put_bss(wdev->wiphy, info->bss); - return; - } + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + + (info->ap_mld_addr ? 
ETH_ALEN : 0) + link_info_size, gfp); + if (!ev) + goto out; ev->type = EVENT_ROAMED; next = ((u8 *)ev) + sizeof(*ev); @@ -1044,12 +1192,43 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; if (info->fils.update_erp_next_seq_num) ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; - ev->rm.bss = info->bss; + if (info->ap_mld_addr) { + ev->rm.ap_mld_addr = next; + memcpy((void *)ev->rm.ap_mld_addr, info->ap_mld_addr, + ETH_ALEN); + next += ETH_ALEN; + } + ev->rm.valid_links = info->valid_links; + for_each_valid_link(info, link) { + ev->rm.links[link].bss = info->links[link].bss; + + if (info->links[link].addr) { + ev->rm.links[link].addr = next; + memcpy((void *)ev->rm.links[link].addr, + info->links[link].addr, + ETH_ALEN); + next += ETH_ALEN; + } + + if (info->links[link].bssid) { + ev->rm.links[link].bssid = next; + memcpy((void *)ev->rm.links[link].bssid, + info->links[link].bssid, + ETH_ALEN); + next += ETH_ALEN; + } + } spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); + + return; +out: + for_each_valid_link(info, link) + cfg80211_put_bss(wdev->wiphy, info->links[link].bss); + } EXPORT_SYMBOL(cfg80211_roamed); -- cgit v1.2.3 From e6445be4f549a0342cd9b8672b82e3b1d85d017f Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Sat, 11 Jun 2022 22:14:08 +0800 Subject: media: uapi: Add some RGB bus formats for i.MX8qm/qxp pixel combiner This patch adds RGB666_1X30_CPADLO, RGB888_1X30_CPADLO, RGB666_1X36_CPADLO and RGB888_1X36_CPADLO bus formats used by the i.MX8qm/qxp pixel combiner. The RGB pixels with padding low per component are transmitted on a 30-bit input bus (10-bit per component) from a display controller or a 36-bit output bus (12-bit per component) to a pixel link.
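As a sketch of the "padding low" layout (not part of this patch; the helper name is made up), a 24-bit RGB888 pixel would map onto the 30-bit MEDIA_BUS_FMT_RGB888_1X30_CPADLO bus with each 8-bit component in the upper bits of its 10-bit lane and the two pad bits driven low, assuming the usual R/G/B ordering of the other MEDIA_BUS_FMT_RGB* formats:

	/* Pack one RGB888 pixel into a 30-bit CPADLO bus word: each
	 * 10-bit lane is <8 data bits><2 zero pad bits>, with R in
	 * the highest lane. Illustrative sketch only.
	 */
	static u32 rgb888_to_1x30_cpadlo(u8 r, u8 g, u8 b)
	{
		return ((u32)r << 22) | ((u32)g << 12) | ((u32)b << 2);
	}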
Reviewed-by: Robert Foss Reviewed-by: Laurent Pinchart Signed-off-by: Liu Ying Acked-by: Sakari Ailus Signed-off-by: Robert Foss Link: https://patchwork.freedesktop.org/patch/msgid/20220611141421.718743-2-victor.liu@nxp.com --- include/uapi/linux/media-bus-format.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 0dfc11ee243a..ec3323dbb927 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -34,7 +34,7 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x101e */ +/* RGB - next is 0x1022 */ #define MEDIA_BUS_FMT_RGB444_1X12 0x1016 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 @@ -59,9 +59,13 @@ #define MEDIA_BUS_FMT_RGB888_3X8_DELTA 0x101d #define MEDIA_BUS_FMT_RGB888_1X7X4_SPWG 0x1011 #define MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA 0x1012 +#define MEDIA_BUS_FMT_RGB666_1X30_CPADLO 0x101e +#define MEDIA_BUS_FMT_RGB888_1X30_CPADLO 0x101f #define MEDIA_BUS_FMT_ARGB8888_1X32 0x100d #define MEDIA_BUS_FMT_RGB888_1X32_PADHI 0x100f #define MEDIA_BUS_FMT_RGB101010_1X30 0x1018 +#define MEDIA_BUS_FMT_RGB666_1X36_CPADLO 0x1020 +#define MEDIA_BUS_FMT_RGB888_1X36_CPADLO 0x1021 #define MEDIA_BUS_FMT_RGB121212_1X36 0x1019 #define MEDIA_BUS_FMT_RGB161616_1X48 0x101a -- cgit v1.2.3 From 084cc29f8bbb034cf30a7ee07a816c115e0c28df Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:21 +0000 Subject: KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis In some cases, the NX hugepage mitigation for iTLB multihit is not needed for all guests on a host. Allow disabling the mitigation on a per-VM basis to avoid the performance hit of NX hugepages on trusted workloads. In order to disable NX hugepages on a VM, ensure that the userspace actor has permission to reboot the system. Since disabling NX hugepages would allow a guest to crash the system, it is similar to reboot permissions. Ideally, KVM would require userspace to prove it has access to KVM's nx_huge_pages module param, e.g. so that userspace can opt out without needing full reboot permissions. But getting access to the module param file info is difficult because it is buried in layers of sysfs and module glue. Requiring CAP_SYS_BOOT is sufficient for all known use cases. Suggested-by: Jim Mattson Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-9-bgardon@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu/mmu_internal.h | 7 ++++--- arch/x86/kvm/mmu/spte.c | 7 ++++--- arch/x86/kvm/mmu/spte.h | 3 ++- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 8 files changed, 60 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 320cb04f7bd9..bafaeedd455c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8206,6 +8206,22 @@ PV guests. The `KVM_PV_DUMP` command is available for the dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is available and supports the `KVM_PV_DUMP_CPU` subcommand. 
+8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +--------------------------- + +:Capability KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +:Architectures: x86 +:Type: vm +:Parameters: arg[0] must be 0. +:Returns 0 on success, -EPERM if the userspace process does not + have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been + created. + +This capability disables the NX huge pages mitigation for iTLB MULTIHIT. + +The capability has no effect if the nx_huge_pages module parameter is not set. + +This capability may only be set before any vCPUs are created. 9. Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e37727a74d0a..7e4c31b57a75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1336,6 +1336,8 @@ struct kvm_arch { * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. */ u32 max_vcpu_ids; + + bool disable_nx_huge_pages; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5e1e3c8f8aaa..bb9d12ac0db3 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -155,9 +155,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -256,7 +256,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 242e4828d7df..db294c1beea2 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,7 +147,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -246,7 +246,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index) { u64 child_spte; int child_level; @@ -274,7 +275,7 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) * When splitting to a 4K page, mark the page executable as the * NX hugepage mitigation no longer applies. 
*/ - if (is_nx_huge_page_enabled()) + if (is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 121c5eaaec77..256f90587e8d 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -421,7 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1ea40809ef1f..522e2532343b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i); /* * Replace the huge spte with a pointer to the populated lower level diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b3b2ea8ee0..7ce0c6fe166d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -6184,6 +6185,35 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7569b4ec199c..a36e78710382 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1166,6 +1166,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 0a2ff7cc8ad48a86939a91bd3457f38e59e741a1 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 21 Jun 2022 15:49:19 +0800 Subject: Bonding: add per-port priority for failover re-selection Add per port priority support for bonding active slave re-selection during failover. A higher number means higher priority in selection. The primary slave still has the highest priority. This option also follows the primary_reselect rules. 
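For example, with an iproute2 recent enough to expose the new IFLA_BOND_SLAVE_PRIO attribute (interface names are illustrative), a port's priority can be set with:

	ip link set dev eth1 type bond_slave prio 10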
This option can only be configured via netlink. Signed-off-by: Hangbin Liu Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- Documentation/networking/bonding.rst | 11 +++++++++++ drivers/net/bonding/bond_main.c | 27 +++++++++++++++++++++++++++ drivers/net/bonding/bond_netlink.c | 15 +++++++++++++++ drivers/net/bonding/bond_options.c | 33 +++++++++++++++++++++++++++++++++ include/net/bond_options.h | 1 + include/net/bonding.h | 1 + include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 8 files changed, 90 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 43be3782e5df..53a18ff7cf23 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -780,6 +780,17 @@ peer_notif_delay value is 0 which means to match the value of the link monitor interval. +prio + Slave priority. A higher number means higher priority. + The primary slave has the highest priority. This option also + follows the primary_reselect rules. + + This option could only be configured via netlink, and is only valid + for active-backup(1), balance-tlb (5) and balance-alb (6) mode. + The valid value range is a signed 32 bit integer. + + The default value is 0. + primary A string (eth0, eth2, etc) specifying which slave is the diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index dc3e9a06e1aa..e75acb14d066 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1026,12 +1026,38 @@ out: } +/** + * bond_choose_primary_or_current - select the primary or high priority slave + * @bond: our bonding struct + * + * - Check if there is a primary link. If the primary link was set and is up, + * go on and do link reselection. + * + * - If primary link is not set or down, find the highest priority link. + * If the highest priority link is not current slave, set it as primary + * link and do link reselection.
+ */ static struct slave *bond_choose_primary_or_current(struct bonding *bond) { struct slave *prim = rtnl_dereference(bond->primary_slave); struct slave *curr = rtnl_dereference(bond->curr_active_slave); + struct slave *slave, *hprio = NULL; + struct list_head *iter; if (!prim || prim->link != BOND_LINK_UP) { + bond_for_each_slave(bond, slave, iter) { + if (slave->link == BOND_LINK_UP) { + hprio = hprio ?: slave; + if (slave->prio > hprio->prio) + hprio = slave; + } + } + + if (hprio && hprio != curr) { + prim = hprio; + goto link_reselect; + } + if (!curr || curr->link != BOND_LINK_UP) return NULL; return curr; @@ -1042,6 +1068,7 @@ static struct slave *bond_choose_primary_or_current(struct bonding *bond) return prim; } +link_reselect: if (!curr || curr->link != BOND_LINK_UP) return prim; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 5a6f44455b95..c2d080fc4fc4 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -27,6 +27,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_AGGREGATOR_ID */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ + nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ 0; } @@ -53,6 +54,9 @@ static int bond_fill_slave_info(struct sk_buff *skb, if (nla_put_u16(skb, IFLA_BOND_SLAVE_QUEUE_ID, slave->queue_id)) goto nla_put_failure; + if (nla_put_s32(skb, IFLA_BOND_SLAVE_PRIO, slave->prio)) + goto nla_put_failure; + if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { const struct aggregator *agg; const struct port *ad_port; @@ -117,6 +121,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, + [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], @@ -157,6 +162,16 @@ static int bond_slave_changelink(struct net_device *bond_dev, return err; } + if (data[IFLA_BOND_SLAVE_PRIO]) { + int prio = nla_get_s32(data[IFLA_BOND_SLAVE_PRIO]); + + bond_opt_slave_initval(&newval, &slave_dev, prio); + err = __bond_opt_set(bond, BOND_OPT_PRIO, &newval, + data[IFLA_BOND_SLAVE_PRIO], extack); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 96eef19cffc4..3498db1c1b3c 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -40,6 +40,8 @@ static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_all_targets_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_prio_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_primary_reselect_set(struct bonding *bond, @@ -365,6 +367,16 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_intmax_tbl, .set = bond_option_miimon_set }, + [BOND_OPT_PRIO] = { + .id = BOND_OPT_PRIO, + .name = "prio", + .desc = "Link priority for failover re-selection", + .flags = BOND_OPTFLAG_RAWVAL, + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | + BIT(BOND_MODE_TLB) | + BIT(BOND_MODE_ALB)), + .set = 
bond_option_prio_set + }, [BOND_OPT_PRIMARY] = { .id = BOND_OPT_PRIMARY, .name = "primary", @@ -1306,6 +1318,27 @@ static int bond_option_missed_max_set(struct bonding *bond, return 0; } +static int bond_option_prio_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct slave *slave; + + slave = bond_slave_get_rtnl(newval->slave_dev); + if (!slave) { + netdev_dbg(newval->slave_dev, "%s called on NULL slave\n", __func__); + return -ENODEV; + } + slave->prio = newval->value; + + if (rtnl_dereference(bond->primary_slave)) + slave_warn(bond->dev, slave->dev, + "prio updated, but will not affect failover re-selection as primary slave have been set\n"); + else + bond_select_active_slave(bond); + + return 0; +} + static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_options.h b/include/net/bond_options.h index eade8236a4df..d2aea5cf1e41 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -67,6 +67,7 @@ enum { BOND_OPT_LACP_ACTIVE, BOND_OPT_MISSED_MAX, BOND_OPT_NS_TARGETS, + BOND_OPT_PRIO, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index cb904d356e31..6e78d657aa05 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -178,6 +178,7 @@ struct slave { u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; + int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5f58dcfe2787..e36d9d2c65a7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -963,6 +963,7 @@ enum { IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, __IFLA_BOND_SLAVE_MAX, }; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index b339bf2196ca..0242f31e339c 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -890,6 +890,7 @@ enum { IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From a08d6a6dc82036cbd889fe3d53f9c69dc13436eb Mon Sep 17 00:00:00 2001 From: Clément Léger Date: Fri, 24 Jun 2022 16:39:48 +0200 Subject: net: dsa: add Renesas RZ/N1 switch tag driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The switch that is present on the Renesas RZ/N1 SoC uses a specific VLAN value followed by 6 bytes which contain the forwarding configuration. Signed-off-by: Clément Léger Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- include/net/dsa.h | 2 + include/uapi/linux/if_ether.h | 1 + net/dsa/Kconfig | 7 +++ net/dsa/Makefile | 1 + net/dsa/tag_rzn1_a5psw.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 124 insertions(+) create mode 100644 net/dsa/tag_rzn1_a5psw.c (limited to 'include/uapi/linux') diff --git a/include/net/dsa.h b/include/net/dsa.h index 64da5ed27fdc..33283eeda697 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -53,6 +53,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 #define DSA_TAG_PROTO_RTL8_4T_VALUE 25 +#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -81,6 +82,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, + DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, }; struct dsa_switch; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 1d0bccc3fa54..d370165bc621 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -116,6 +116,7 @@ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_A5PSW 0xE001 /* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 8cb87b5067ee..63853fff4e2f 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -132,6 +132,13 @@ config NET_DSA_TAG_RTL8_4 Say Y or M if you want to enable support for tagging frames for Realtek switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC. +config NET_DSA_TAG_RZN1_A5PSW + tristate "Tag driver for Renesas RZ/N1 A5PSW switch" + help + Say Y or M if you want to enable support for tagging frames for + Renesas RZ/N1 embedded switch that uses an 8 byte tag located after + destination MAC address. + config NET_DSA_TAG_LAN9303 tristate "Tag driver for SMSC/Microchip LAN9303 family of switches" help diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 9f75820e7c98..af28c24ead18 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o +obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c new file mode 100644 index 000000000000..e2a5ee6ae688 --- /dev/null +++ b/net/dsa/tag_rzn1_a5psw.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Schneider Electric + * + * Clément Léger + */ + +#include +#include +#include +#include + +#include "dsa_priv.h" + +/* To define the outgoing port and to discover the incoming port a TAG is + * inserted after Src MAC : + * + * Dest MAC Src MAC TAG Type + * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |... 
+ * |<--------------->| + * + * See struct a5psw_tag for layout + */ + +#define ETH_P_DSA_A5PSW 0xE001 +#define A5PSW_TAG_LEN 8 +#define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0) +/* This is both used for xmit tag and rcv tagging */ +#define A5PSW_CTRL_DATA_PORT GENMASK(3, 0) + +struct a5psw_tag { + __be16 ctrl_tag; + __be16 ctrl_data; + __be16 ctrl_data2_hi; + __be16 ctrl_data2_lo; +}; + +static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct a5psw_tag *ptag; + u32 data2_val; + + BUILD_BUG_ON(sizeof(*ptag) != A5PSW_TAG_LEN); + + /* The Ethernet switch we are interfaced with needs packets to be at + * least 60 bytes otherwise they will be discarded when they enter the + * switch port logic. + */ + if (__skb_put_padto(skb, ETH_ZLEN, false)) + return NULL; + + /* provide 'A5PSW_TAG_LEN' bytes additional space */ + skb_push(skb, A5PSW_TAG_LEN); + + /* make room between MACs and Ether-Type to insert tag */ + dsa_alloc_etype_header(skb, A5PSW_TAG_LEN); + + ptag = dsa_etype_header_pos_tx(skb); + + data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, BIT(dp->index)); + ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW); + ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD); + ptag->ctrl_data2_lo = htons(data2_val); + ptag->ctrl_data2_hi = 0; + + return skb; +} + +static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + struct a5psw_tag *tag; + int port; + + if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) { + dev_warn_ratelimited(&dev->dev, + "Dropping packet, cannot pull\n"); + return NULL; + } + + tag = dsa_etype_header_pos_rx(skb); + + if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n"); + return NULL; + } + + port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data)); + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; + + skb_pull_rcsum(skb, A5PSW_TAG_LEN); + dsa_strip_etype_header(skb, A5PSW_TAG_LEN); + + dsa_default_offload_fwd_mark(skb); + + return skb; +} + +static const struct dsa_device_ops a5psw_netdev_ops = { + .name = "a5psw", + .proto = DSA_TAG_PROTO_RZN1_A5PSW, + .xmit = a5psw_tag_xmit, + .rcv = a5psw_tag_rcv, + .needed_headroom = A5PSW_TAG_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW); +module_dsa_tag_driver(a5psw_netdev_ops); -- cgit v1.2.3 From e23ee9d2c4ccb08fdfee3aea0a04a27bccd77433 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 24 Jun 2022 23:54:21 +0300 Subject: serial: Use bits for UART_LSR_BRK_ERROR_BITS/MSR_ANY_DELTA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of listing the bits for UART_LSR_BRK_ERROR_BITS and UART_MSR_ANY_DELTA in comment, use them to define instead. 
Reviewed-by: Jiri Slaby Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220624205424.12686-4-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_reg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index f51bc8f36813..bab3b39266cc 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -139,7 +139,7 @@ #define UART_LSR_PE 0x04 /* Parity error indicator */ #define UART_LSR_OE 0x02 /* Overrun error indicator */ #define UART_LSR_DR 0x01 /* Receiver data ready */ -#define UART_LSR_BRK_ERROR_BITS 0x1E /* BI, FE, PE, OE bits */ +#define UART_LSR_BRK_ERROR_BITS (UART_LSR_BI|UART_LSR_FE|UART_LSR_PE|UART_LSR_OE) #define UART_MSR 6 /* In: Modem Status Register */ #define UART_MSR_DCD 0x80 /* Data Carrier Detect */ @@ -150,7 +150,7 @@ #define UART_MSR_TERI 0x04 /* Trailing edge ring indicator */ #define UART_MSR_DDSR 0x02 /* Delta DSR */ #define UART_MSR_DCTS 0x01 /* Delta CTS */ -#define UART_MSR_ANY_DELTA 0x0F /* Any of the delta bits! */ +#define UART_MSR_ANY_DELTA (UART_MSR_DDCD|UART_MSR_TERI|UART_MSR_DDSR|UART_MSR_DCTS) #define UART_SCR 7 /* I/O: Scratch Register */ -- cgit v1.2.3 From 4f768e94774c58c9f7f54ebd38dadf172970046a Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 24 Jun 2022 23:42:09 +0300 Subject: serial: Support for RS-485 multipoint addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for RS-485 multipoint addressing using the 9th bit [*]. The addressing mode is configured through ->rs485_config(). ADDRB in termios indicates that 9th-bit addressing mode is enabled. In this mode, the 9th bit is used to indicate an address (byte) within the communication line. ADDRB can only be enabled/disabled through ->rs485_config(), which is also responsible for setting the destination and receiver (filter) addresses. Add traps to detect unwanted changes to the struct serial_rs485 layout using static_assert(). [*] Technically, RS485 is just an electrical spec and does not itself specify the 9th bit addressing mode, but the 9th bit seems to be at least a "semi-standard" way to do addressing with RS485. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220624204210.11112-6-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/serial/driver.rst | 2 ++ Documentation/driver-api/serial/serial-rs485.rst | 26 +++++++++++++++++++++++- drivers/tty/serial/serial_core.c | 22 +++++++++++++++++++- drivers/tty/tty_ioctl.c | 4 ++++ include/uapi/asm-generic/termbits-common.h | 1 + include/uapi/linux/serial.h | 20 ++++++++++++++++-- 6 files changed, 71 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst index 1e7ab4142d49..ee1679858aa2 100644 --- a/Documentation/driver-api/serial/driver.rst +++ b/Documentation/driver-api/serial/driver.rst @@ -261,6 +261,8 @@ hardware. - parity enable PARODD - odd parity (when PARENB is in force) + ADDRB + - address bit (changed through .rs485_config()).
CREAD - enable reception of characters (if not set, still receive characters from the port, but diff --git a/Documentation/driver-api/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst index 00b5d333acba..6ebad75c74ed 100644 --- a/Documentation/driver-api/serial/serial-rs485.rst +++ b/Documentation/driver-api/serial/serial-rs485.rst @@ -99,7 +99,31 @@ RS485 Serial Communications /* Error handling. See errno. */ } -5. References +5. Multipoint Addressing +======================== + + The Linux kernel provides addressing mode for multipoint RS-485 serial + communications line. The addressing mode is enabled with SER_RS485_ADDRB + flag in serial_rs485. Struct serial_rs485 has two additional flags and + fields for enabling receive and destination addresses. + + Address mode flags: + - SER_RS485_ADDRB: Enabled addressing mode (sets also ADDRB in termios). + - SER_RS485_ADDR_RECV: Receive (filter) address enabled. + - SER_RS485_ADDR_DEST: Set destination address. + + Address fields (enabled with corresponding SER_RS485_ADDR_* flag): + - addr_recv: Receive address. + - addr_dest: Destination address. + + Once a receive address is set, the communication can occur only with the + particular device and other peers are filtered out. It is left up to the + receiver side to enforce the filtering. Receive address will be cleared + if SER_RS485_ADDR_RECV is not set. + + Note: not all devices supporting RS485 support multipoint addressing. + +6. References ============= [1] include/uapi/linux/serial.h diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2529153c8979..85ef7ef00b82 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1288,6 +1288,17 @@ static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *r if (flags & ~port->rs485_supported->flags) return -EINVAL; + /* Asking for address w/o addressing mode? */ + if (!(rs485->flags & SER_RS485_ADDRB) && + (rs485->flags & (SER_RS485_ADDR_RECV|SER_RS485_ADDR_DEST))) + return -EINVAL; + + /* Address given but not enabled? 
*/ + if (!(rs485->flags & SER_RS485_ADDR_RECV) && rs485->addr_recv) + return -EINVAL; + if (!(rs485->flags & SER_RS485_ADDR_DEST) && rs485->addr_dest) + return -EINVAL; + return 0; } @@ -1343,7 +1354,8 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4 rs485->flags &= supported_flags; /* Return clean padding area to userspace */ - memset(rs485->padding, 0, sizeof(rs485->padding)); + memset(rs485->padding0, 0, sizeof(rs485->padding0)); + memset(rs485->padding1, 0, sizeof(rs485->padding1)); } int uart_rs485_config(struct uart_port *port) @@ -3402,5 +3414,13 @@ int uart_get_rs485_mode(struct uart_port *port) } EXPORT_SYMBOL_GPL(uart_get_rs485_mode); +/* Compile-time assertions for serial_rs485 layout */ +static_assert(offsetof(struct serial_rs485, padding) == + (offsetof(struct serial_rs485, delay_rts_after_send) + sizeof(__u32))); +static_assert(offsetof(struct serial_rs485, padding1) == + offsetof(struct serial_rs485, padding[1])); +static_assert((offsetof(struct serial_rs485, padding[4]) + sizeof(__u32)) == + sizeof(struct serial_rs485)); + MODULE_DESCRIPTION("Serial driver core"); MODULE_LICENSE("GPL"); diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index adae687f654b..2a76b330e108 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -319,6 +319,8 @@ unsigned char tty_get_frame_size(unsigned int cflag) bits++; if (cflag & PARENB) bits++; + if (cflag & ADDRB) + bits++; return bits; } @@ -353,6 +355,8 @@ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) old_termios = tty->termios; tty->termios = *new_termios; unset_locked_termios(tty, &old_termios); + /* Reset any ADDRB changes, ADDRB is changed through ->rs485_config() */ + tty->termios.c_cflag ^= (tty->termios.c_cflag ^ old_termios.c_cflag) & ADDRB; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old_termios); diff --git a/include/uapi/asm-generic/termbits-common.h b/include/uapi/asm-generic/termbits-common.h index 4d084fe8def5..4a6a79f28b21 100644 --- a/include/uapi/asm-generic/termbits-common.h +++ b/include/uapi/asm-generic/termbits-common.h @@ -46,6 +46,7 @@ typedef unsigned int speed_t; #define EXTA B19200 #define EXTB B38400 +#define ADDRB 0x20000000 /* address bit */ #define CMSPAR 0x40000000 /* mark or space (stick) parity */ #define CRTSCTS 0x80000000 /* flow control */ diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h index fa6b16e5fdd8..cea06924b295 100644 --- a/include/uapi/linux/serial.h +++ b/include/uapi/linux/serial.h @@ -126,10 +126,26 @@ struct serial_rs485 { #define SER_RS485_TERMINATE_BUS (1 << 5) /* Enable bus termination (if supported) */ + +/* RS-485 addressing mode */ +#define SER_RS485_ADDRB (1 << 6) /* Enable addressing mode */ +#define SER_RS485_ADDR_RECV (1 << 7) /* Receive address filter */ +#define SER_RS485_ADDR_DEST (1 << 8) /* Destination address */ + __u32 delay_rts_before_send; /* Delay before send (milliseconds) */ __u32 delay_rts_after_send; /* Delay after send (milliseconds) */ - __u32 padding[5]; /* Memory is cheap, new structs - are a royal PITA .. 
*/ + + /* The fields below are defined by flags */ + union { + __u32 padding[5]; /* Memory is cheap, new structs are a pain */ + + struct { + __u8 addr_recv; + __u8 addr_dest; + __u8 padding0[2]; + __u32 padding1[4]; + }; + }; }; /* -- cgit v1.2.3 From ec5ad331680c96ef3dd30dc297b206988023b9e1 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Sat, 18 Jun 2022 20:01:34 +0200 Subject: tty: Add N_CAN327 line discipline ID for ELM327 based CAN driver The actual driver will be added via the CAN tree. Acked-by: Marc Kleine-Budde Signed-off-by: Max Staudt Link: https://lore.kernel.org/r/20220618180134.9890-1-max@enpas.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index 9d0f06bfbac3..68aeae2addec 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -38,8 +38,9 @@ #define N_NULL 27 /* Null ldisc used for error handling */ #define N_MCTP 28 /* MCTP-over-serial */ #define N_DEVELOPMENT 29 /* Manual out-of-tree testing */ +#define N_CAN327 30 /* ELM327 based OBD-II interfaces */ /* Always the newest line discipline + 1 */ -#define NR_LDISCS 30 +#define NR_LDISCS 31 #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From 9864bb4801331daa48514face9d0f4861e4d485b Mon Sep 17 00:00:00 2001 From: Li Li Date: Thu, 26 May 2022 15:00:18 -0700 Subject: Binder: add TF_UPDATE_TXN to replace outdated txn When the target process is busy, incoming oneway transactions are queued in the async_todo list. If the clients continue sending extra oneway transactions while the target process is frozen, this queue can become too large to accommodate new transactions. That's why the binder driver introduced ONEWAY_SPAM_DETECTION to detect this situation. It's helpful for debugging the async binder buffer exhaustion issue, but the issue itself isn't solved directly. In real cases, applications are designed to send oneway transactions repeatedly, delivering updated information to the target process. Typical examples are Wi-Fi signal strength and some real-time sensor data. Even though the apps might only care about the latest information, all outdated oneway transactions are still accumulated there until the frozen process is thawed later. For this kind of situation, there's no existing method to skip those outdated transactions and deliver the latest one only. This patch introduces a new transaction flag TF_UPDATE_TXN. To use it, user apps can set this new flag along with TF_ONE_WAY. When such a oneway transaction is to be queued into the async_todo list of a frozen process, the binder driver will check whether any previous pending transaction can be superseded by comparing their code, flags and target node. If such an outdated pending transaction is found, the latest transaction will supersede that outdated one. This effectively prevents the async binder buffer from running out and saves unnecessary binder read workloads.
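For illustration only (not part of the patch): a minimal userspace sketch of how a client might mark a oneway transaction as updatable. The handle, code and payload below are hypothetical; only TF_ONE_WAY, TF_UPDATE_TXN and the struct layout come from the uapi header.

    /*
     * Hedged sketch: fill a binder_transaction_data so that a newer
     * transaction with a matching code, flags and target node may
     * supersede this one while the target process is frozen. The
     * handle and code values are invented for illustration.
     */
    #include <linux/android/binder.h>
    #include <stdint.h>
    #include <string.h>

    static void fill_update_txn(struct binder_transaction_data *tr,
                                __u32 handle, __u32 code,
                                const void *payload, binder_size_t len)
    {
            memset(tr, 0, sizeof(*tr));
            tr->target.handle = handle;
            tr->code = code;
            tr->flags = TF_ONE_WAY | TF_UPDATE_TXN; /* async + updatable */
            tr->data_size = len;
            tr->data.ptr.buffer = (binder_uintptr_t)(uintptr_t)payload;
    }

As binder_can_update_transaction() in the diff below shows, a pending transaction is only replaced when the code, flags and target node of the two transactions match.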
Acked-by: Todd Kjos Signed-off-by: Li Li Link: https://lore.kernel.org/r/20220526220018.3334775-2-dualli@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 85 +++++++++++++++++++++++++++++++++++-- drivers/android/binder_trace.h | 4 ++ include/uapi/linux/android/binder.h | 1 + 3 files changed, 87 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 362c0deb65f1..d4f84f25c30b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2626,6 +2626,56 @@ static int binder_fixup_parent(struct list_head *pf_head, return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0); } +/** + * binder_can_update_transaction() - Can a txn be superseded by an updated one? + * @t1: the pending async txn in the frozen process + * @t2: the new async txn to supersede the outdated pending one + * + * Return: true if t2 can supersede t1 + * false if t2 cannot supersede t1 + */ +static bool binder_can_update_transaction(struct binder_transaction *t1, + struct binder_transaction *t2) +{ + if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != + (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) + return false; + if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && + t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && + t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && + t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) + return true; + return false; +} + +/** + * binder_find_outdated_transaction_ilocked() - Find the outdated transaction + * @t: new async transaction + * @target_list: list to search for the outdated transaction + * + * Return: the outdated transaction if found + * NULL if no outdated transaction can be found + * + * Requires the proc->inner_lock to be held.
+ */ +static struct binder_transaction * +binder_find_outdated_transaction_ilocked(struct binder_transaction *t, + struct list_head *target_list) +{ + struct binder_work *w; + + list_for_each_entry(w, target_list, entry) { + struct binder_transaction *t_queued; + + if (w->type != BINDER_WORK_TRANSACTION) + continue; + t_queued = container_of(w, struct binder_transaction, work); + if (binder_can_update_transaction(t_queued, t)) + return t_queued; + } + return NULL; +} + /** * binder_proc_transaction() - sends a transaction to a process and wakes it up * @t: transaction to send @@ -2651,6 +2701,7 @@ static int binder_proc_transaction(struct binder_transaction *t, struct binder_node *node = t->buffer->target_node; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; + struct binder_transaction *t_outdated = NULL; BUG_ON(!node); binder_node_lock(node); @@ -2678,12 +2729,24 @@ static int binder_proc_transaction(struct binder_transaction *t, if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); - if (thread) + if (thread) { binder_enqueue_thread_work_ilocked(thread, &t->work); - else if (!pending_async) + } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); - else + } else { + if ((t->flags & TF_UPDATE_TXN) && proc->is_frozen) { + t_outdated = binder_find_outdated_transaction_ilocked(t, + &node->async_todo); + if (t_outdated) { + binder_debug(BINDER_DEBUG_TRANSACTION, + "txn %d supersedes %d\n", + t->debug_id, t_outdated->debug_id); + list_del_init(&t_outdated->work.entry); + proc->outstanding_txns--; + } + } binder_enqueue_work_ilocked(&t->work, &node->async_todo); + } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); @@ -2692,6 +2755,22 @@ static int binder_proc_transaction(struct binder_transaction *t, binder_inner_proc_unlock(proc); binder_node_unlock(node); + /* + * To reduce potential contention, free the outdated transaction and + * buffer after releasing the locks. 
+ */ + if (t_outdated) { + struct binder_buffer *buffer = t_outdated->buffer; + + t_outdated->buffer = NULL; + buffer->transaction = NULL; + trace_binder_transaction_update_buffer_release(buffer); + binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); + binder_alloc_free_buf(&proc->alloc, buffer); + kfree(t_outdated); + binder_stats_deleted(BINDER_STAT_TRANSACTION); + } + return 0; } diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 8eeccdc64724..8cc07e6a4273 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -311,6 +311,10 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TP_PROTO(struct binder_buffer *buffer), TP_ARGS(buffer)); +DEFINE_EVENT(binder_buffer_class, binder_transaction_update_buffer_release, + TP_PROTO(struct binder_buffer *buffer), + TP_ARGS(buffer)); + TRACE_EVENT(binder_update_page_range, TP_PROTO(struct binder_alloc *alloc, bool allocate, void __user *start, void __user *end), diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 986333cf5bbe..e72e4de8f452 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -287,6 +287,7 @@ enum transaction_flags { TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */ TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */ TF_CLEAR_BUF = 0x20, /* clear buffer on txn complete */ + TF_UPDATE_TXN = 0x40, /* update the outdated pending async txn */ }; struct binder_transaction_data { -- cgit v1.2.3 From 713eb3c1261a1f89e35bdf233265aa5a2c46e9b2 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Sat, 18 Jun 2022 20:01:34 +0200 Subject: tty: Add N_CAN327 line discipline ID for ELM327 based CAN driver The actual driver will be added via the CAN tree. Link: https://lore.kernel.org/all/20220618180134.9890-1-max@enpas.org Link: https://lore.kernel.org/all/Yrm9Ezlw1dLmIxyS@kroah.com Signed-off-by: Max Staudt Acked-by: Marc Kleine-Budde Acked-by: Greg Kroah-Hartman Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/tty.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index 9d0f06bfbac3..68aeae2addec 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -38,8 +38,9 @@ #define N_NULL 27 /* Null ldisc used for error handling */ #define N_MCTP 28 /* MCTP-over-serial */ #define N_DEVELOPMENT 29 /* Manual out-of-tree testing */ +#define N_CAN327 30 /* ELM327 based OBD-II interfaces */ /* Always the newest line discipline + 1 */ -#define NR_LDISCS 30 +#define NR_LDISCS 31 #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From 119a784c81270eb88e573174ed2209225d646656 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Jun 2022 11:06:23 -0700 Subject: perf/core: Add a new read format to get a number of lost samples Sometimes we want to know an accurate number of samples even if some of them are lost. Currently PERF_RECORD_LOST is generated for a ring-buffer which might be shared with other events. So it's hard to know the per-event lost count. Add event->lost_samples field and PERF_FORMAT_LOST to retrieve it from userspace.
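As a hedged illustration of the new ABI (not part of the patch): a reader could request the lost count next to the event id and pull both back with a single read(). The software event and sample period below are arbitrary choices.

    /*
     * Hedged sketch: open a sampling event with PERF_FORMAT_ID |
     * PERF_FORMAT_LOST and read back { value, id, lost }. The lost
     * count only grows when the event's ring buffer drops samples.
     */
    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            struct perf_event_attr attr = {
                    .type = PERF_TYPE_SOFTWARE,
                    .size = sizeof(attr),
                    .config = PERF_COUNT_SW_CPU_CLOCK,
                    .sample_period = 100000,
                    .read_format = PERF_FORMAT_ID | PERF_FORMAT_LOST,
            };
            uint64_t buf[3]; /* value, id, lost (no PERF_FORMAT_GROUP) */
            int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);

            if (fd < 0 || read(fd, buf, sizeof(buf)) != sizeof(buf))
                    return 1;
            printf("value=%llu id=%llu lost=%llu\n",
                   (unsigned long long)buf[0],
                   (unsigned long long)buf[1],
                   (unsigned long long)buf[2]);
            return 0;
    }

Per the layout comment in the uapi header, lost is the last field in each record, after the optional id, so consumers must size their read buffer according to the format bits they set.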
Original-patch-by: Jiri Olsa Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220616180623.1358843-1-namhyung@kernel.org --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 5 ++++- kernel/events/core.c | 21 ++++++++++++++++++--- kernel/events/ring_buffer.c | 5 ++++- 4 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index da759560eec5..ee8b9ecdc03b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -759,6 +759,8 @@ struct perf_event { struct pid_namespace *ns; u64 id; + atomic64_t lost_samples; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d37629dbad72..0474ee362151 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -301,6 +301,7 @@ enum { * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } && !PERF_FORMAT_GROUP * * { u64 nr; @@ -308,6 +309,7 @@ enum { * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING * { u64 value; * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST * } cntr[nr]; * } && PERF_FORMAT_GROUP * }; @@ -317,8 +319,9 @@ enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 80782cddb1da..4d8c335a07db 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1819,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings) if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); @@ -5260,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -5321,7 +5328,7 @@ static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); @@ -5331,6 +5338,8 @@ static int perf_read_one(struct perf_event *event, values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -6858,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 enabled, u64 running) { u64 read_format = 
event->attr.read_format; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = perf_event_count(event); @@ -6872,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } @@ -6882,7 +6893,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; - u64 values[5]; + u64 values[6]; int n = 0; values[n++] = 1 + leader->nr_siblings; @@ -6900,6 +6911,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); @@ -6913,6 +6926,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fb35b926024c..726132039c38 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle, goto out; if (unlikely(rb->paused)) { - if (rb->nr_pages) + if (rb->nr_pages) { local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); + } goto out; } @@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle, fail: local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); perf_output_put_handle(handle); out: rcu_read_unlock(); -- cgit v1.2.3 From 94dfc73e7cf4a31da66b8843f0b9283ddd6b8381 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 6 Apr 2022 19:36:51 -0500 Subject: treewide: uapi: Replace zero-length arrays with flexible-array members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. This code was transformed with the help of Coccinelle: (linux-5.19-rc2$ spatch --jobs $(getconf _NPROCESSORS_ONLN) --sp-file script.cocci --include-headers --dir . > output.patch) @@ identifier S, member, array; type T1, T2; @@ struct S { ... T1 member; T2 array[ - 0 ]; }; -fstrict-flex-arrays=3 is coming and we need to land these changes to prevent issues like these in the near future: ../fs/minix/dir.c:337:3: warning: 'strcpy' will always overflow; destination buffer has size 0, but the source string has length 2 (including NUL byte) [-Wfortify-source] strcpy(de3->name, "."); ^ Since these are all [0] to [] changes, the risk to UAPI is nearly zero. If this breaks anything, we can use a union with a new member name.
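To make the transformation concrete, here is a hedged before/after sketch; the struct is invented, and struct_size() is the usual kernel helper for sizing such trailing arrays.

    /*
     * Hedged illustration (invented struct): the [0] and [] forms are
     * allocated and indexed identically; only the compiler's view of
     * the array bound changes.
     */
    #include <linux/overflow.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct msg {
            u32 len;
            u8 data[];      /* was: u8 data[0]; */
    };

    static struct msg *msg_alloc(u32 len)
    {
            /* struct_size() computes sizeof(struct msg) + len bytes,
             * saturating on overflow. */
            struct msg *m = kzalloc(struct_size(m, data, len), GFP_KERNEL);

            if (m)
                    m->len = len;
            return m;
    }

With the [] spelling, FORTIFY_SOURCE and the coming -fstrict-flex-arrays=3 treat the trailing array as genuinely unbounded rather than as size zero, which is exactly what the strcpy warning above complains about.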
[1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Build-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/62b675ec.wKX6AOZ6cbE71vtF%25lkp@intel.com/ Acked-by: Dan Williams # For ndctl.h Signed-off-by: Gustavo A. R. Silva --- arch/m68k/include/uapi/asm/bootinfo.h | 4 +- arch/mips/include/uapi/asm/ucontext.h | 2 +- arch/s390/include/uapi/asm/hwctrset.h | 6 +-- arch/x86/include/uapi/asm/bootparam.h | 2 +- arch/x86/include/uapi/asm/kvm.h | 12 ++--- include/uapi/drm/i915_drm.h | 6 +-- include/uapi/linux/blkzoned.h | 2 +- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/btrfs.h | 10 ++-- include/uapi/linux/btrfs_tree.h | 2 +- include/uapi/linux/can/bcm.h | 2 +- include/uapi/linux/connector.h | 2 +- include/uapi/linux/cycx_cfm.h | 2 +- include/uapi/linux/dm-ioctl.h | 8 +-- include/uapi/linux/dm-log-userspace.h | 2 +- include/uapi/linux/ethtool.h | 28 +++++----- include/uapi/linux/fanotify.h | 2 +- include/uapi/linux/fiemap.h | 2 +- include/uapi/linux/firewire-cdev.h | 12 ++--- include/uapi/linux/fs.h | 2 +- include/uapi/linux/if_alg.h | 2 +- include/uapi/linux/if_arcnet.h | 6 +-- include/uapi/linux/if_pppox.h | 4 +- include/uapi/linux/if_tun.h | 2 +- include/uapi/linux/igmp.h | 6 +-- include/uapi/linux/inet_diag.h | 2 +- include/uapi/linux/inotify.h | 2 +- include/uapi/linux/ip.h | 4 +- include/uapi/linux/ip_vs.h | 4 +- include/uapi/linux/iso_fs.h | 4 +- include/uapi/linux/jffs2.h | 8 +-- include/uapi/linux/kcov.h | 2 +- include/uapi/linux/kvm.h | 8 +-- include/uapi/linux/minix_fs.h | 4 +- include/uapi/linux/mmc/ioctl.h | 2 +- include/uapi/linux/ndctl.h | 10 ++-- include/uapi/linux/net_dropmon.h | 4 +- include/uapi/linux/netfilter/x_tables.h | 4 +- include/uapi/linux/netfilter_arp/arp_tables.h | 6 +-- include/uapi/linux/netfilter_bridge/ebt_among.h | 2 +- include/uapi/linux/netfilter_ipv4/ip_tables.h | 6 +-- include/uapi/linux/netfilter_ipv6/ip6_tables.h | 4 +- include/uapi/linux/perf_event.h | 2 +- include/uapi/linux/pkt_cls.h | 4 +- include/uapi/linux/raid/md_p.h | 2 +- include/uapi/linux/random.h | 2 +- include/uapi/linux/romfs_fs.h | 4 +- include/uapi/linux/rtnetlink.h | 2 +- include/uapi/linux/sctp.h | 10 ++-- include/uapi/linux/seg6.h | 2 +- include/uapi/linux/seg6_iptunnel.h | 2 +- include/uapi/linux/stm.h | 2 +- include/uapi/linux/target_core_user.h | 2 +- include/uapi/linux/usb/audio.h | 2 +- include/uapi/linux/usb/cdc.h | 6 +-- include/uapi/linux/usb/ch9.h | 2 +- include/uapi/linux/usb/raw_gadget.h | 4 +- include/uapi/linux/usbdevice_fs.h | 4 +- include/uapi/linux/vhost_types.h | 4 +- include/uapi/linux/virtio_9p.h | 2 +- include/uapi/linux/xfrm.h | 10 ++-- include/uapi/rdma/hfi/hfi1_user.h | 2 +- include/uapi/rdma/ib_user_verbs.h | 72 ++++++++++++------------- include/uapi/rdma/rdma_user_cm.h | 2 +- include/uapi/rdma/rdma_user_ioctl_cmds.h | 2 +- include/uapi/scsi/fc/fc_els.h | 18 +++---- include/uapi/scsi/scsi_bsg_fc.h | 2 +- include/uapi/sound/asound.h | 2 +- include/uapi/sound/firewire.h | 6 +-- include/uapi/sound/skl-tplg-interface.h | 2 +- include/uapi/sound/sof/header.h | 2 +- include/uapi/sound/usb_stream.h | 2 +- tools/arch/x86/include/uapi/asm/kvm.h | 12 ++--- tools/include/uapi/drm/i915_drm.h | 6 +-- tools/include/uapi/linux/fs.h | 2 +- tools/include/uapi/linux/if_tun.h | 2 +- tools/include/uapi/linux/kvm.h | 8 +-- tools/include/uapi/linux/perf_event.h | 2 +- tools/include/uapi/linux/pkt_cls.h | 4 +- 
tools/include/uapi/linux/seg6.h | 4 +- tools/include/uapi/linux/usbdevice_fs.h | 4 +- tools/include/uapi/sound/asound.h | 2 +- 82 files changed, 216 insertions(+), 216 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h index 203d9cbf9630..95ecf3ae4c49 100644 --- a/arch/m68k/include/uapi/asm/bootinfo.h +++ b/arch/m68k/include/uapi/asm/bootinfo.h @@ -34,7 +34,7 @@ struct bi_record { __be16 tag; /* tag ID */ __be16 size; /* size of record (in bytes) */ - __be32 data[0]; /* data */ + __be32 data[]; /* data */ }; @@ -168,7 +168,7 @@ struct bootversion { struct { __be32 machtype; __be32 version; - } machversions[0]; + } machversions[]; } __packed; #endif /* __ASSEMBLY__ */ diff --git a/arch/mips/include/uapi/asm/ucontext.h b/arch/mips/include/uapi/asm/ucontext.h index 2d3bf8eebf1f..6122ef97c6ff 100644 --- a/arch/mips/include/uapi/asm/ucontext.h +++ b/arch/mips/include/uapi/asm/ucontext.h @@ -60,7 +60,7 @@ struct ucontext { sigset_t uc_sigmask; /* Extended context structures may follow ucontext */ - unsigned long long uc_extcontext[0]; + unsigned long long uc_extcontext[]; }; #endif /* __MIPS_UAPI_ASM_UCONTEXT_H */ diff --git a/arch/s390/include/uapi/asm/hwctrset.h b/arch/s390/include/uapi/asm/hwctrset.h index 3d8284b95f87..e56b9dd23a4b 100644 --- a/arch/s390/include/uapi/asm/hwctrset.h +++ b/arch/s390/include/uapi/asm/hwctrset.h @@ -30,18 +30,18 @@ struct s390_ctrset_start { /* Set CPUs to operate on */ struct s390_ctrset_setdata { /* Counter set data */ __u32 set; /* Counter set number */ __u32 no_cnts; /* # of counters stored in cv[] */ - __u64 cv[0]; /* Counter values (variable length) */ + __u64 cv[]; /* Counter values (variable length) */ }; struct s390_ctrset_cpudata { /* Counter set data per CPU */ __u32 cpu_nr; /* CPU number */ __u32 no_sets; /* # of counters sets in data[] */ - struct s390_ctrset_setdata data[0]; + struct s390_ctrset_setdata data[]; }; struct s390_ctrset_read { /* Structure to get all ctr sets */ __u64 no_cpus; /* Total # of CPUs data taken from */ - struct s390_ctrset_cpudata data[0]; + struct s390_ctrset_cpudata data[]; }; #define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index bea5cdcdf532..cdd6c7f6cfa6 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -52,7 +52,7 @@ struct setup_data { __u64 next; __u32 type; __u32 len; - __u8 data[0]; + __u8 data[]; }; /* extensible setup indirect data node */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..ec53c9fa1da9 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -198,13 +198,13 @@ struct kvm_msrs { __u32 nmsrs; /* number of msrs in entries */ __u32 pad; - struct kvm_msr_entry entries[0]; + struct kvm_msr_entry entries[]; }; /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; + __u32 indices[]; }; /* Maximum size of any access bitmap in bytes */ @@ -241,7 +241,7 @@ struct kvm_cpuid_entry { struct kvm_cpuid { __u32 nent; __u32 padding; - struct kvm_cpuid_entry entries[0]; + struct kvm_cpuid_entry entries[]; }; struct kvm_cpuid_entry2 { @@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 { struct kvm_cpuid2 { __u32 nent; __u32 padding; - struct kvm_cpuid_entry2 entries[0]; + struct kvm_cpuid_entry2 entries[]; }; /* for KVM_GET_PIT and KVM_SET_PIT */ 
@@ -389,7 +389,7 @@ struct kvm_xsave { * the contents of CPUID leaf 0xD on the host. */ __u32 region[1024]; - __u32 extra[0]; + __u32 extra[]; }; #define KVM_MAX_XCRS 16 @@ -516,7 +516,7 @@ struct kvm_pmu_event_filter { __u32 fixed_counter_bitmap; __u32 flags; __u32 pad[4]; - __u64 events[0]; + __u64 events[]; }; #define KVM_PMU_EVENT_ALLOW 0 diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index a2def7b27009..b28ff5d88145 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2123,7 +2123,7 @@ struct i915_context_engines_load_balance { __u64 mbz64; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \ @@ -2161,7 +2161,7 @@ struct i915_context_engines_bond { __u64 flags; /* all undefined flags must be zero */ __u64 mbz64[4]; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \ @@ -2288,7 +2288,7 @@ struct i915_context_engines_parallel_submit { * length = width (i) * num_siblings (j) * index = j + i * num_siblings */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __packed; diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 656a326821a2..b80fcc9ea525 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -130,7 +130,7 @@ struct blk_zone_report { __u64 sector; __u32 nr_zones; __u32 flags; - struct blk_zone zones[0]; + struct blk_zone zones[]; }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..e4b33ba06f00 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -79,7 +79,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index d956b2993970..3d0edbe3b991 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -93,7 +93,7 @@ struct btrfs_qgroup_inherit { __u64 num_ref_copies; __u64 num_excl_copies; struct btrfs_qgroup_limit lim; - __u64 qgroups[0]; + __u64 qgroups[]; }; struct btrfs_ioctl_qgroup_limit_args { @@ -561,7 +561,7 @@ struct btrfs_ioctl_search_args_v2 { __u64 buf_size; /* in - size of buffer * out - on EOVERFLOW: needed size * to store item */ - __u64 buf[0]; /* out - found items */ + __u64 buf[]; /* out - found items */ }; struct btrfs_ioctl_clone_range_args { @@ -632,7 +632,7 @@ struct btrfs_ioctl_same_args { __u16 dest_count; /* in - total elements in info array */ __u16 reserved1; __u32 reserved2; - struct btrfs_ioctl_same_extent_info info[0]; + struct btrfs_ioctl_same_extent_info info[]; }; struct btrfs_ioctl_space_info { @@ -644,7 +644,7 @@ struct btrfs_ioctl_space_info { struct btrfs_ioctl_space_args { __u64 space_slots; __u64 total_spaces; - struct btrfs_ioctl_space_info spaces[0]; + struct btrfs_ioctl_space_info spaces[]; }; struct btrfs_data_container { @@ -652,7 +652,7 @@ struct btrfs_data_container { __u32 bytes_missing; /* out -- additional bytes needed for result */ __u32 elem_cnt; /* out */ __u32 elem_missed; /* 
out */ - __u64 val[0]; /* out */ + __u64 val[]; /* out */ }; struct btrfs_ioctl_ino_path_args { diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index d4117152d907..5f32a2a495dc 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -575,7 +575,7 @@ struct btrfs_inode_extref { __le64 parent_objectid; __le64 index; __le16 name_len; - __u8 name[0]; + __u8 name[]; /* name goes here */ } __attribute__ ((__packed__)); diff --git a/include/uapi/linux/can/bcm.h b/include/uapi/linux/can/bcm.h index dd2b925b09ac..f1e45f533a72 100644 --- a/include/uapi/linux/can/bcm.h +++ b/include/uapi/linux/can/bcm.h @@ -71,7 +71,7 @@ struct bcm_msg_head { struct bcm_timeval ival1, ival2; canid_t can_id; __u32 nframes; - struct can_frame frames[0]; + struct can_frame frames[]; }; enum { diff --git a/include/uapi/linux/connector.h b/include/uapi/linux/connector.h index 3738936149a2..5ae131c3f145 100644 --- a/include/uapi/linux/connector.h +++ b/include/uapi/linux/connector.h @@ -75,7 +75,7 @@ struct cn_msg { __u16 len; /* Length of the following data */ __u16 flags; - __u8 data[0]; + __u8 data[]; }; #endif /* _UAPI__CONNECTOR_H */ diff --git a/include/uapi/linux/cycx_cfm.h b/include/uapi/linux/cycx_cfm.h index 51f541942ff9..91778c8024b1 100644 --- a/include/uapi/linux/cycx_cfm.h +++ b/include/uapi/linux/cycx_cfm.h @@ -91,7 +91,7 @@ struct cycx_firmware { unsigned short reserved[6]; char descr[CFM_DESCR_LEN]; struct cycx_fw_info info; - unsigned char image[0]; + unsigned char image[]; }; struct cycx_fw_header { diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2e9550fef90f..8c97d75f3104 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -182,7 +182,7 @@ struct dm_target_spec { struct dm_target_deps { __u32 count; /* Array size */ __u32 padding; /* unused */ - __u64 dev[0]; /* out */ + __u64 dev[]; /* out */ }; /* @@ -192,7 +192,7 @@ struct dm_name_list { __u64 dev; __u32 next; /* offset to the next record from the _start_ of this */ - char name[0]; + char name[]; /* * The following members can be accessed by taking a pointer that @@ -216,7 +216,7 @@ struct dm_target_versions { __u32 next; __u32 version[3]; - char name[0]; + char name[]; }; /* @@ -225,7 +225,7 @@ struct dm_target_versions { struct dm_target_msg { __u64 sector; /* Device sector */ - char message[0]; + char message[]; }; /* diff --git a/include/uapi/linux/dm-log-userspace.h b/include/uapi/linux/dm-log-userspace.h index 5c47a8603376..23dad9565e46 100644 --- a/include/uapi/linux/dm-log-userspace.h +++ b/include/uapi/linux/dm-log-userspace.h @@ -426,7 +426,7 @@ struct dm_ulog_request { __u32 request_type; /* DM_ULOG_* defined above */ __u32 data_size; /* How much data (not including this struct) */ - char data[0]; + char data[]; }; #endif /* __DM_LOG_USERSPACE_H__ */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index e0f0ee9bc89e..2d5741fd44bb 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -257,7 +257,7 @@ struct ethtool_tunable { __u32 id; __u32 type_id; __u32 len; - void *data[0]; + void *data[]; }; #define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff @@ -322,7 +322,7 @@ struct ethtool_regs { __u32 cmd; __u32 version; __u32 len; - __u8 data[0]; + __u8 data[]; }; /** @@ -348,7 +348,7 @@ struct ethtool_eeprom { __u32 magic; __u32 offset; __u32 len; - __u8 data[0]; + __u8 data[]; }; /** @@ -752,7 +752,7 @@ struct ethtool_gstrings { __u32 cmd; __u32 string_set; __u32 len; - __u8 data[0]; 
+ __u8 data[]; }; /** @@ -777,7 +777,7 @@ struct ethtool_sset_info { __u32 cmd; __u32 reserved; __u64 sset_mask; - __u32 data[0]; + __u32 data[]; }; /** @@ -817,7 +817,7 @@ struct ethtool_test { __u32 flags; __u32 reserved; __u32 len; - __u64 data[0]; + __u64 data[]; }; /** @@ -834,7 +834,7 @@ struct ethtool_test { struct ethtool_stats { __u32 cmd; __u32 n_stats; - __u64 data[0]; + __u64 data[]; }; /** @@ -851,7 +851,7 @@ struct ethtool_stats { struct ethtool_perm_addr { __u32 cmd; __u32 size; - __u8 data[0]; + __u8 data[]; }; /* boolean flags controlling per-interface behavior characteristics. @@ -1160,7 +1160,7 @@ struct ethtool_rxnfc { struct ethtool_rxfh_indir { __u32 cmd; __u32 size; - __u32 ring_index[0]; + __u32 ring_index[]; }; /** @@ -1201,7 +1201,7 @@ struct ethtool_rxfh { __u8 hfunc; __u8 rsvd8[3]; __u32 rsvd32; - __u32 rss_config[0]; + __u32 rss_config[]; }; #define ETH_RXFH_CONTEXT_ALLOC 0xffffffff #define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff @@ -1286,7 +1286,7 @@ struct ethtool_dump { __u32 version; __u32 flag; __u32 len; - __u8 data[0]; + __u8 data[]; }; #define ETH_FW_DUMP_DISABLE 0 @@ -1318,7 +1318,7 @@ struct ethtool_get_features_block { struct ethtool_gfeatures { __u32 cmd; __u32 size; - struct ethtool_get_features_block features[0]; + struct ethtool_get_features_block features[]; }; /** @@ -1340,7 +1340,7 @@ struct ethtool_set_features_block { struct ethtool_sfeatures { __u32 cmd; __u32 size; - struct ethtool_set_features_block features[0]; + struct ethtool_set_features_block features[]; }; /** @@ -2087,7 +2087,7 @@ struct ethtool_link_settings { __u8 master_slave_state; __u8 reserved1[1]; __u32 reserved[7]; - __u32 link_mode_masks[0]; + __u32 link_mode_masks[]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; * __u32 map_advertising[link_mode_masks_nwords]; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index f1f89132d60e..197df344307d 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -162,7 +162,7 @@ struct fanotify_event_info_fid { * Following is an opaque struct file_handle that can be passed as * an argument to open_by_handle_at(2). 
*/ - unsigned char handle[0]; + unsigned char handle[]; }; /* diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 07c1cdcb715e..24ca0c00cae3 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -34,7 +34,7 @@ struct fiemap { __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ __u32 fm_extent_count; /* size of fm_extents array (in) */ __u32 fm_reserved; - struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ + struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ }; #define FIEMAP_MAX_OFFSET (~0ULL) diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 5effa9832802..92be3ea3c6e0 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -118,7 +118,7 @@ struct fw_cdev_event_response { __u32 type; __u32 rcode; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -142,7 +142,7 @@ struct fw_cdev_event_request { __u64 offset; __u32 handle; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -205,7 +205,7 @@ struct fw_cdev_event_request2 { __u32 generation; __u32 handle; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -265,7 +265,7 @@ struct fw_cdev_event_iso_interrupt { __u32 type; __u32 cycle; __u32 header_length; - __u32 header[0]; + __u32 header[]; }; /** @@ -355,7 +355,7 @@ struct fw_cdev_event_phy_packet { __u32 type; __u32 rcode; __u32 length; - __u32 data[0]; + __u32 data[]; }; /** @@ -803,7 +803,7 @@ struct fw_cdev_set_iso_channels { */ struct fw_cdev_iso_packet { __u32 control; - __u32 header[0]; + __u32 header[]; }; /** diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index bdf7b404b3e7..b7b56871029c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -90,7 +90,7 @@ struct file_dedupe_range { __u16 dest_count; /* in - total elements in info array */ __u16 reserved1; /* must be zero */ __u32 reserved2; /* must be zero */ - struct file_dedupe_range_info info[0]; + struct file_dedupe_range_info info[]; }; /* And dynamically-tunable limits and defaults: */ diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index dc52a11ba6d1..578b18aab821 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -42,7 +42,7 @@ struct sockaddr_alg_new { struct af_alg_iv { __u32 ivlen; - __u8 iv[0]; + __u8 iv[]; }; /* Socket options */ diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index 683878036d76..b122cfac7128 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -60,7 +60,7 @@ struct arc_rfc1201 { __u8 proto; /* protocol ID field - varies */ __u8 split_flag; /* for use with split packets */ __be16 sequence; /* sequence number */ - __u8 payload[0]; /* space remaining in packet (504 bytes)*/ + __u8 payload[]; /* space remaining in packet (504 bytes)*/ }; #define RFC1201_HDR_SIZE 4 @@ -69,7 +69,7 @@ struct arc_rfc1201 { */ struct arc_rfc1051 { __u8 proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ - __u8 payload[0]; /* 507 bytes */ + __u8 payload[]; /* 507 bytes */ }; #define RFC1051_HDR_SIZE 1 @@ -80,7 +80,7 @@ struct arc_rfc1051 { struct arc_eth_encap { __u8 proto; /* Always ARC_P_ETHER */ struct ethhdr eth; /* standard ethernet header (yuck!) 
*/ - __u8 payload[0]; /* 493 bytes */ + __u8 payload[]; /* 493 bytes */ }; #define ETH_ENCAP_HDR_SIZE 14 diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index e7a693c28f16..9abd80dcc46f 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -122,7 +122,7 @@ struct sockaddr_pppol2tpv3in6 { struct pppoe_tag { __be16 tag_type; __be16 tag_len; - char tag_data[0]; + char tag_data[]; } __attribute__ ((packed)); /* Tag identifiers */ @@ -150,7 +150,7 @@ struct pppoe_hdr { __u8 code; __be16 sid; __be16 length; - struct pppoe_tag tag[0]; + struct pppoe_tag tag[]; } __packed; /* Length of entire PPPoE + PPP header */ diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 454ae31b93c7..2ec07de1d73b 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -108,7 +108,7 @@ struct tun_pi { struct tun_filter { __u16 flags; /* TUN_FLT_ flags see above */ __u16 count; /* Number of addresses */ - __u8 addr[0][ETH_ALEN]; + __u8 addr[][ETH_ALEN]; }; #endif /* _UAPI__IF_TUN_H */ diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h index 90c28bc466c6..5930f2437cd1 100644 --- a/include/uapi/linux/igmp.h +++ b/include/uapi/linux/igmp.h @@ -48,7 +48,7 @@ struct igmpv3_grec { __u8 grec_auxwords; __be16 grec_nsrcs; __be32 grec_mca; - __be32 grec_src[0]; + __be32 grec_src[]; }; struct igmpv3_report { @@ -57,7 +57,7 @@ struct igmpv3_report { __sum16 csum; __be16 resv2; __be16 ngrec; - struct igmpv3_grec grec[0]; + struct igmpv3_grec grec[]; }; struct igmpv3_query { @@ -78,7 +78,7 @@ struct igmpv3_query { #endif __u8 qqic; __be16 nsrcs; - __be32 srcs[0]; + __be32 srcs[]; }; #define IGMP_HOST_MEMBERSHIP_QUERY 0x11 /* From RFC1112 */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 20ee93f0f876..50655de04c9b 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -104,7 +104,7 @@ struct inet_diag_hostcond { __u8 family; __u8 prefix_len; int port; - __be32 addr[0]; + __be32 addr[]; }; struct inet_diag_markcond { diff --git a/include/uapi/linux/inotify.h b/include/uapi/linux/inotify.h index 884b4846b630..b3e165853d5b 100644 --- a/include/uapi/linux/inotify.h +++ b/include/uapi/linux/inotify.h @@ -23,7 +23,7 @@ struct inotify_event { __u32 mask; /* watch mask */ __u32 cookie; /* cookie to synchronize two events */ __u32 len; /* length (including nulls) of name */ - char name[0]; /* stub for possible name */ + char name[]; /* stub for possible name */ }; /* the following are legal, implemented events that user-space can watch for */ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index e00bbb9c47bb..961ec16a26b8 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -112,13 +112,13 @@ struct ip_auth_hdr { __be16 reserved; __be32 spi; __be32 seq_no; /* Sequence number */ - __u8 auth_data[0]; /* Variable len but >=4. Mind the 64 bit alignment! */ + __u8 auth_data[]; /* Variable len but >=4. Mind the 64 bit alignment! */ }; struct ip_esp_hdr { __be32 spi; __be32 seq_no; /* Sequence number */ - __u8 enc_data[0]; /* Variable len but >=8. Mind the 64 bit alignment! */ + __u8 enc_data[]; /* Variable len but >=8. Mind the 64 bit alignment! 
*/ }; struct ip_comp_hdr { diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 4102ddcb4e14..1ed234e7f251 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -254,7 +254,7 @@ struct ip_vs_get_dests { unsigned int num_dests; /* the real servers */ - struct ip_vs_dest_entry entrytable[0]; + struct ip_vs_dest_entry entrytable[]; }; @@ -264,7 +264,7 @@ struct ip_vs_get_services { unsigned int num_services; /* service table */ - struct ip_vs_service_entry entrytable[0]; + struct ip_vs_service_entry entrytable[]; }; diff --git a/include/uapi/linux/iso_fs.h b/include/uapi/linux/iso_fs.h index a2555176f6d1..758178f5b52d 100644 --- a/include/uapi/linux/iso_fs.h +++ b/include/uapi/linux/iso_fs.h @@ -137,7 +137,7 @@ struct iso_path_table{ __u8 name_len[2]; /* 721 */ __u8 extent[4]; /* 731 */ __u8 parent[2]; /* 721 */ - char name[0]; + char name[]; } __attribute__((packed)); /* high sierra is identical to iso, except that the date is only 6 bytes, and @@ -154,7 +154,7 @@ struct iso_directory_record { __u8 interleave [ISODCL (28, 28)]; /* 711 */ __u8 volume_sequence_number [ISODCL (29, 32)]; /* 723 */ __u8 name_len [ISODCL (33, 33)]; /* 711 */ - char name [0]; + char name []; } __attribute__((packed)); #define ISOFS_BLOCK_BITS 11 diff --git a/include/uapi/linux/jffs2.h b/include/uapi/linux/jffs2.h index 784ba0b9690a..637ee4a793cf 100644 --- a/include/uapi/linux/jffs2.h +++ b/include/uapi/linux/jffs2.h @@ -123,7 +123,7 @@ struct jffs2_raw_dirent __u8 unused[2]; jint32_t node_crc; jint32_t name_crc; - __u8 name[0]; + __u8 name[]; }; /* The JFFS2 raw inode structure: Used for storage on physical media. */ @@ -155,7 +155,7 @@ struct jffs2_raw_inode jint16_t flags; /* See JFFS2_INO_FLAG_* */ jint32_t data_crc; /* CRC for the (compressed) data. 
*/ jint32_t node_crc; /* CRC for the raw inode (excluding data) */ - __u8 data[0]; + __u8 data[]; }; struct jffs2_raw_xattr { @@ -170,7 +170,7 @@ struct jffs2_raw_xattr { jint16_t value_len; jint32_t data_crc; jint32_t node_crc; - __u8 data[0]; + __u8 data[]; } __attribute__((packed)); struct jffs2_raw_xref @@ -196,7 +196,7 @@ struct jffs2_raw_summary jint32_t padded; /* sum of the size of padding nodes */ jint32_t sum_crc; /* summary information crc */ jint32_t node_crc; /* node crc */ - jint32_t sum[0]; /* inode summary info */ + jint32_t sum[]; /* inode summary info */ }; union jffs2_node_union diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 1d0350e44ae3..ed95dba9fa37 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -13,7 +13,7 @@ struct kcov_remote_arg { __u32 area_size; /* Length of coverage buffer in words */ __u32 num_handles; /* Size of handles array */ __aligned_u64 common_handle; - __aligned_u64 handles[0]; + __aligned_u64 handles[]; }; #define KCOV_REMOTE_MAX_HANDLES 0x100 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..74dc8bafcb9e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -542,7 +542,7 @@ struct kvm_coalesced_mmio { struct kvm_coalesced_mmio_ring { __u32 first, last; - struct kvm_coalesced_mmio coalesced_mmio[0]; + struct kvm_coalesced_mmio coalesced_mmio[]; }; #define KVM_COALESCED_MMIO_MAX \ @@ -621,7 +621,7 @@ struct kvm_clear_dirty_log { /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; - __u8 sigset[0]; + __u8 sigset[]; }; /* for KVM_TPR_ACCESS_REPORTING */ @@ -1221,7 +1221,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing { __u32 nr; __u32 flags; - struct kvm_irq_routing_entry entries[0]; + struct kvm_irq_routing_entry entries[]; }; #endif @@ -1341,7 +1341,7 @@ struct kvm_dirty_tlb { struct kvm_reg_list { __u64 n; /* number of regs */ - __u64 reg[0]; + __u64 reg[]; }; struct kvm_one_reg { diff --git a/include/uapi/linux/minix_fs.h b/include/uapi/linux/minix_fs.h index 95dbcb17eacd..8d9ca8b2c357 100644 --- a/include/uapi/linux/minix_fs.h +++ b/include/uapi/linux/minix_fs.h @@ -97,11 +97,11 @@ struct minix3_super_block { struct minix_dir_entry { __u16 inode; - char name[0]; + char name[]; }; struct minix3_dir_entry { __u32 inode; - char name[0]; + char name[]; }; #endif diff --git a/include/uapi/linux/mmc/ioctl.h b/include/uapi/linux/mmc/ioctl.h index 27a39847d55c..e7401ade6822 100644 --- a/include/uapi/linux/mmc/ioctl.h +++ b/include/uapi/linux/mmc/ioctl.h @@ -58,7 +58,7 @@ struct mmc_ioc_cmd { */ struct mmc_ioc_multi_cmd { __u64 num_of_cmds; - struct mmc_ioc_cmd cmds[0]; + struct mmc_ioc_cmd cmds[]; }; #define MMC_IOC_CMD _IOWR(MMC_BLOCK_MAJOR, 0, struct mmc_ioc_cmd) diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 17e02b64ea2e..73516e263627 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -30,25 +30,25 @@ struct nd_cmd_get_config_data_hdr { __u32 in_offset; __u32 in_length; __u32 status; - __u8 out_buf[0]; + __u8 out_buf[]; } __packed; struct nd_cmd_set_config_hdr { __u32 in_offset; __u32 in_length; - __u8 in_buf[0]; + __u8 in_buf[]; } __packed; struct nd_cmd_vendor_hdr { __u32 opcode; __u32 in_length; - __u8 in_buf[0]; + __u8 in_buf[]; } __packed; struct nd_cmd_vendor_tail { __u32 status; __u32 out_length; - __u8 out_buf[0]; + __u8 out_buf[]; } __packed; struct nd_cmd_ars_cap { @@ -86,7 +86,7 @@ struct nd_cmd_ars_status { __u32 reserved; __u64 err_address; __u64 length; - } 
__packed records[0]; + } __packed records[]; } __packed; struct nd_cmd_clear_error { diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 1bbea8f0681e..84f622a66a7a 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -29,12 +29,12 @@ struct net_dm_config_entry { struct net_dm_config_msg { __u32 entries; - struct net_dm_config_entry options[0]; + struct net_dm_config_entry options[]; }; struct net_dm_alert_msg { __u32 entries; - struct net_dm_drop_point points[0]; + struct net_dm_drop_point points[]; }; struct net_dm_user_msg { diff --git a/include/uapi/linux/netfilter/x_tables.h b/include/uapi/linux/netfilter/x_tables.h index b8c6bb233ac1..796af83a963a 100644 --- a/include/uapi/linux/netfilter/x_tables.h +++ b/include/uapi/linux/netfilter/x_tables.h @@ -28,7 +28,7 @@ struct xt_entry_match { __u16 match_size; } u; - unsigned char data[0]; + unsigned char data[]; }; struct xt_entry_target { @@ -119,7 +119,7 @@ struct xt_counters_info { unsigned int num_counters; /* The counters (actually `number' of these). */ - struct xt_counters counters[0]; + struct xt_counters counters[]; }; #define XT_INV_PROTO 0x40 /* Invert the sense of PROTO. */ diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index bbf5af2b67a8..a6ac2463f787 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -109,7 +109,7 @@ struct arpt_entry struct xt_counters counters; /* The matches (if any), then the target. */ - unsigned char elems[0]; + unsigned char elems[]; }; /* @@ -181,7 +181,7 @@ struct arpt_replace { struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ - struct arpt_entry entries[0]; + struct arpt_entry entries[]; }; /* The argument to ARPT_SO_GET_ENTRIES. */ @@ -193,7 +193,7 @@ struct arpt_get_entries { unsigned int size; /* The entries. */ - struct arpt_entry entrytable[0]; + struct arpt_entry entrytable[]; }; /* Helper functions */ diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h index 9acf757bc1f7..73b26a280c4f 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_among.h +++ b/include/uapi/linux/netfilter_bridge/ebt_among.h @@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple { struct ebt_mac_wormhash { int table[257]; int poolsize; - struct ebt_mac_wormhash_tuple pool[0]; + struct ebt_mac_wormhash_tuple pool[]; }; #define ebt_mac_wormhash_size(x) ((x) ? sizeof(struct ebt_mac_wormhash) \ diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 50c7fee625ae..1485df28b239 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -121,7 +121,7 @@ struct ipt_entry { struct xt_counters counters; /* The matches (if any), then the target. */ - unsigned char elems[0]; + unsigned char elems[]; }; /* @@ -203,7 +203,7 @@ struct ipt_replace { struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ - struct ipt_entry entries[0]; + struct ipt_entry entries[]; }; /* The argument to IPT_SO_GET_ENTRIES. */ @@ -215,7 +215,7 @@ struct ipt_get_entries { unsigned int size; /* The entries. 
*/ - struct ipt_entry entrytable[0]; + struct ipt_entry entrytable[]; }; /* Helper functions */ diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index d9e364f96a5c..766e8e0bcc68 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -243,7 +243,7 @@ struct ip6t_replace { struct xt_counters __user *counters; /* The entries (hang off end: not really an array). */ - struct ip6t_entry entries[0]; + struct ip6t_entry entries[]; }; /* The argument to IP6T_SO_GET_ENTRIES. */ @@ -255,7 +255,7 @@ struct ip6t_get_entries { unsigned int size; /* The entries. */ - struct ip6t_entry entrytable[0]; + struct ip6t_entry entrytable[]; }; /* Helper functions */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d37629dbad72..4653834f078f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -491,7 +491,7 @@ struct perf_event_query_bpf { /* * User provided buffer to store program ids */ - __u32 ids[0]; + __u32 ids[]; }; /* diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9a2ee1e39fad..ffbe230ef90b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -256,7 +256,7 @@ struct tc_u32_sel { short hoff; __be32 hmask; - struct tc_u32_key keys[0]; + struct tc_u32_key keys[]; }; struct tc_u32_mark { @@ -268,7 +268,7 @@ struct tc_u32_mark { struct tc_u32_pcnt { __u64 rcnt; __u64 rhit; - __u64 kcnts[0]; + __u64 kcnts[]; }; /* Flags */ diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index e5a98a16f9b0..6c0aa577730f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -303,7 +303,7 @@ struct mdp_superblock_1 { * into the 'roles' value. If a device is spare or faulty, then it doesn't * have a meaningful role. 
*/ - __le16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ + __le16 dev_roles[]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; /* feature_map bits */ diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h index dcc1b3e6106f..e744c23582eb 100644 --- a/include/uapi/linux/random.h +++ b/include/uapi/linux/random.h @@ -41,7 +41,7 @@ struct rand_pool_info { int entropy_count; int buf_size; - __u32 buf[0]; + __u32 buf[]; }; /* diff --git a/include/uapi/linux/romfs_fs.h b/include/uapi/linux/romfs_fs.h index a7f1585accef..6aa05e792454 100644 --- a/include/uapi/linux/romfs_fs.h +++ b/include/uapi/linux/romfs_fs.h @@ -27,7 +27,7 @@ struct romfs_super_block { __be32 word1; __be32 size; __be32 checksum; - char name[0]; /* volume name */ + char name[]; /* volume name */ }; /* On disk inode */ @@ -37,7 +37,7 @@ struct romfs_inode { __be32 spec; __be32 size; __be32 checksum; - char name[0]; + char name[]; }; #define ROMFH_TYPE 7 diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 83849a37db5b..eb2747d58a81 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -440,7 +440,7 @@ struct rtnexthop { /* RTA_VIA */ struct rtvia { __kernel_sa_family_t rtvia_family; - __u8 rtvia_addr[0]; + __u8 rtvia_addr[]; }; /* RTM_CACHEINFO */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index c4ff1ebd8bcc..ed7d4ecbf53d 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -365,7 +365,7 @@ struct sctp_assoc_change { __u16 sac_outbound_streams; __u16 sac_inbound_streams; sctp_assoc_t sac_assoc_id; - __u8 sac_info[0]; + __u8 sac_info[]; }; /* @@ -436,7 +436,7 @@ struct sctp_remote_error { __u32 sre_length; __be16 sre_error; sctp_assoc_t sre_assoc_id; - __u8 sre_data[0]; + __u8 sre_data[]; }; @@ -453,7 +453,7 @@ struct sctp_send_failed { __u32 ssf_error; struct sctp_sndrcvinfo ssf_info; sctp_assoc_t ssf_assoc_id; - __u8 ssf_data[0]; + __u8 ssf_data[]; }; struct sctp_send_failed_event { @@ -463,7 +463,7 @@ struct sctp_send_failed_event { __u32 ssf_error; struct sctp_sndinfo ssfe_info; sctp_assoc_t ssf_assoc_id; - __u8 ssf_data[0]; + __u8 ssf_data[]; }; /* @@ -1029,7 +1029,7 @@ struct sctp_getaddrs_old { struct sctp_getaddrs { sctp_assoc_t assoc_id; /*input*/ __u32 addr_num; /*output*/ - __u8 addrs[0]; /*output, variable size*/ + __u8 addrs[]; /*output, variable size*/ }; /* A socket user request obtained via SCTP_GET_ASSOC_STATS that retrieves diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h index 286e8d6a8e98..13bcbc8bba32 100644 --- a/include/uapi/linux/seg6.h +++ b/include/uapi/linux/seg6.h @@ -30,7 +30,7 @@ struct ipv6_sr_hdr { __u8 flags; __u16 tag; - struct in6_addr segments[0]; + struct in6_addr segments[]; }; #define SR6_FLAG1_PROTECTED (1 << 6) diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index eb815e0d0ac3..a74294211290 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -26,7 +26,7 @@ enum { struct seg6_iptunnel_encap { int mode; - struct ipv6_sr_hdr srh[0]; + struct ipv6_sr_hdr srh[]; }; #define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*x)) + (((x)->srh->hdrlen + 1) << 3)) diff --git a/include/uapi/linux/stm.h b/include/uapi/linux/stm.h index 7bac318b4440..de3579c2cff0 100644 --- a/include/uapi/linux/stm.h +++ b/include/uapi/linux/stm.h @@ -36,7 +36,7 @@ struct stp_policy_id { /* padding */ __u16 __reserved_0; __u32 __reserved_1; - char id[0]; + char id[]; }; 
#define STP_POLICY_ID_SET _IOWR('%', 0, struct stp_policy_id) diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 27ace512babd..fbd8ca67e107 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -152,7 +152,7 @@ struct tcmu_tmr_entry { __u32 cmd_cnt; __u64 __pad3; __u64 __pad4; - __u16 cmd_ids[0]; + __u16 cmd_ids[]; } __packed; #define TCMU_OP_ALIGN_SIZE sizeof(__u64) diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h index 76b7c3f6cd0d..c917c53070d5 100644 --- a/include/uapi/linux/usb/audio.h +++ b/include/uapi/linux/usb/audio.h @@ -341,7 +341,7 @@ struct uac_feature_unit_descriptor { __u8 bUnitID; __u8 bSourceID; __u8 bControlSize; - __u8 bmaControls[0]; /* variable length */ + __u8 bmaControls[]; /* variable length */ } __attribute__((packed)); static inline __u8 uac_feature_unit_iFeature(struct uac_feature_unit_descriptor *desc) diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h index 6d61550959ef..acf3852bb676 100644 --- a/include/uapi/linux/usb/cdc.h +++ b/include/uapi/linux/usb/cdc.h @@ -171,7 +171,7 @@ struct usb_cdc_mdlm_detail_desc { /* type is associated with mdlm_desc.bGUID */ __u8 bGuidDescriptorType; - __u8 bDetailData[0]; + __u8 bDetailData[]; } __attribute__ ((packed)); /* "OBEX Control Model Functional Descriptor" */ @@ -379,7 +379,7 @@ struct usb_cdc_ncm_ndp16 { __le32 dwSignature; __le16 wLength; __le16 wNextNdpIndex; - struct usb_cdc_ncm_dpe16 dpe16[0]; + struct usb_cdc_ncm_dpe16 dpe16[]; } __attribute__ ((packed)); /* 32-bit NCM Datagram Pointer Entry */ @@ -395,7 +395,7 @@ struct usb_cdc_ncm_ndp32 { __le16 wReserved6; __le32 dwNextNdpIndex; __le32 dwReserved12; - struct usb_cdc_ncm_dpe32 dpe32[0]; + struct usb_cdc_ncm_dpe32 dpe32[]; } __attribute__ ((packed)); /* CDC NCM subclass 3.2.1 and 3.2.2 */ diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 17ce56198c9a..31fcfa084e63 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -818,7 +818,7 @@ struct usb_key_descriptor { __u8 tTKID[3]; __u8 bReserved; - __u8 bKeyData[0]; + __u8 bKeyData[]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ diff --git a/include/uapi/linux/usb/raw_gadget.h b/include/uapi/linux/usb/raw_gadget.h index 0be685272eb1..c7d2199134d7 100644 --- a/include/uapi/linux/usb/raw_gadget.h +++ b/include/uapi/linux/usb/raw_gadget.h @@ -60,7 +60,7 @@ enum usb_raw_event_type { struct usb_raw_event { __u32 type; __u32 length; - __u8 data[0]; + __u8 data[]; }; #define USB_RAW_IO_FLAGS_ZERO 0x0001 @@ -90,7 +90,7 @@ struct usb_raw_ep_io { __u16 ep; __u16 flags; __u32 length; - __u8 data[0]; + __u8 data[]; }; /* Maximum number of non-control endpoints in struct usb_raw_eps_info. */ diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h index cf525cddeb94..74a84e02422a 100644 --- a/include/uapi/linux/usbdevice_fs.h +++ b/include/uapi/linux/usbdevice_fs.h @@ -131,7 +131,7 @@ struct usbdevfs_urb { unsigned int signr; /* signal to be sent on completion, or 0 if none should be sent. 
*/ void __user *usercontext; - struct usbdevfs_iso_packet_desc iso_frame_desc[0]; + struct usbdevfs_iso_packet_desc iso_frame_desc[]; }; /* ioctls for talking directly to drivers */ @@ -176,7 +176,7 @@ struct usbdevfs_disconnect_claim { struct usbdevfs_streams { unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */ unsigned int num_eps; - unsigned char eps[0]; + unsigned char eps[]; }; /* diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 634cee485abb..391331a10879 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -107,7 +107,7 @@ struct vhost_memory_region { struct vhost_memory { __u32 nregions; __u32 padding; - struct vhost_memory_region regions[0]; + struct vhost_memory_region regions[]; }; /* VHOST_SCSI specific definitions */ @@ -135,7 +135,7 @@ struct vhost_scsi_target { struct vhost_vdpa_config { __u32 off; __u32 len; - __u8 buf[0]; + __u8 buf[]; }; /* vhost vdpa IOVA range diff --git a/include/uapi/linux/virtio_9p.h b/include/uapi/linux/virtio_9p.h index 441047432258..374b68f8ac6e 100644 --- a/include/uapi/linux/virtio_9p.h +++ b/include/uapi/linux/virtio_9p.h @@ -38,7 +38,7 @@ struct virtio_9p_config { /* length of the tag name */ __virtio16 tag_len; /* non-NULL terminated tag name */ - __u8 tag[0]; + __u8 tag[]; } __attribute__((packed)); #endif /* _LINUX_VIRTIO_9P_H */ diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 65e13a099b1a..e8191e0c3b56 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -33,7 +33,7 @@ struct xfrm_sec_ctx { __u8 ctx_alg; __u16 ctx_len; __u32 ctx_sid; - char ctx_str[0]; + char ctx_str[]; }; /* Security Context Domains of Interpretation */ @@ -96,27 +96,27 @@ struct xfrm_replay_state_esn { __u32 oseq_hi; __u32 seq_hi; __u32 replay_window; - __u32 bmp[0]; + __u32 bmp[]; }; struct xfrm_algo { char alg_name[64]; unsigned int alg_key_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_algo_auth { char alg_name[64]; unsigned int alg_key_len; /* in bits */ unsigned int alg_trunc_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_algo_aead { char alg_name[64]; unsigned int alg_key_len; /* in bits */ unsigned int alg_icv_len; /* in bits */ - char alg_key[0]; + char alg_key[]; }; struct xfrm_stats { diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index d95ef9a2b032..1106a7c90b29 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -180,7 +180,7 @@ struct hfi1_sdma_comp_entry { struct hfi1_status { __aligned_u64 dev; /* device/hw status bits */ __aligned_u64 port; /* port state and status bits */ - char freezemsg[0]; + char freezemsg[]; }; enum sdma_req_opcode { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 7dd903d932e5..43672cb1fd57 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -158,18 +158,18 @@ struct ib_uverbs_ex_cmd_hdr { struct ib_uverbs_get_context { __aligned_u64 response; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_get_context_resp { __u32 async_fd; __u32 num_comp_vectors; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_device { __aligned_u64 response; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_device_resp { @@ -278,7 +278,7 @@ struct ib_uverbs_query_port { __aligned_u64 response; __u8 port_num; __u8 
reserved[7]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_port_resp { @@ -308,12 +308,12 @@ struct ib_uverbs_query_port_resp { struct ib_uverbs_alloc_pd { __aligned_u64 response; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_alloc_pd_resp { __u32 pd_handle; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_dealloc_pd { @@ -324,12 +324,12 @@ struct ib_uverbs_open_xrcd { __aligned_u64 response; __u32 fd; __u32 oflags; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_open_xrcd_resp { __u32 xrcd_handle; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_close_xrcd { @@ -343,14 +343,14 @@ struct ib_uverbs_reg_mr { __aligned_u64 hca_va; __u32 pd_handle; __u32 access_flags; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_reg_mr_resp { __u32 mr_handle; __u32 lkey; __u32 rkey; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_rereg_mr { @@ -362,13 +362,13 @@ struct ib_uverbs_rereg_mr { __aligned_u64 hca_va; __u32 pd_handle; __u32 access_flags; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_rereg_mr_resp { __u32 lkey; __u32 rkey; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_dereg_mr { @@ -380,13 +380,13 @@ struct ib_uverbs_alloc_mw { __u32 pd_handle; __u8 mw_type; __u8 reserved[3]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_alloc_mw_resp { __u32 mw_handle; __u32 rkey; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_dealloc_mw { @@ -408,7 +408,7 @@ struct ib_uverbs_create_cq { __u32 comp_vector; __s32 comp_channel; __u32 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; enum ib_uverbs_ex_create_cq_flags { @@ -442,13 +442,13 @@ struct ib_uverbs_resize_cq { __aligned_u64 response; __u32 cq_handle; __u32 cqe; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_resize_cq_resp { __u32 cqe; __u32 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_poll_cq { @@ -492,7 +492,7 @@ struct ib_uverbs_wc { struct ib_uverbs_poll_cq_resp { __u32 count; __u32 reserved; - struct ib_uverbs_wc wc[0]; + struct ib_uverbs_wc wc[]; }; struct ib_uverbs_req_notify_cq { @@ -585,7 +585,7 @@ struct ib_uverbs_create_qp { __u8 qp_type; __u8 is_srq; __u8 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; enum ib_uverbs_create_qp_mask { @@ -624,7 +624,7 @@ struct ib_uverbs_open_qp { __u32 qpn; __u8 qp_type; __u8 reserved[7]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; /* also used for open response */ @@ -669,7 +669,7 @@ struct ib_uverbs_query_qp { __aligned_u64 response; __u32 qp_handle; __u32 attr_mask; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_qp_resp { @@ -703,7 +703,7 @@ struct ib_uverbs_query_qp_resp { __u8 alt_timeout; __u8 sq_sig_all; __u8 reserved[5]; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_modify_qp { @@ -824,7 +824,7 @@ struct ib_uverbs_post_send { __u32 wr_count; __u32 sge_count; __u32 wqe_size; - struct ib_uverbs_send_wr send_wr[0]; + struct ib_uverbs_send_wr send_wr[]; }; struct ib_uverbs_post_send_resp { @@ -843,7 +843,7 @@ struct ib_uverbs_post_recv { __u32 wr_count; __u32 sge_count; __u32 wqe_size; - struct ib_uverbs_recv_wr recv_wr[0]; + 
struct ib_uverbs_recv_wr recv_wr[]; }; struct ib_uverbs_post_recv_resp { @@ -856,7 +856,7 @@ struct ib_uverbs_post_srq_recv { __u32 wr_count; __u32 sge_count; __u32 wqe_size; - struct ib_uverbs_recv_wr recv[0]; + struct ib_uverbs_recv_wr recv[]; }; struct ib_uverbs_post_srq_recv_resp { @@ -869,12 +869,12 @@ struct ib_uverbs_create_ah { __u32 pd_handle; __u32 reserved; struct ib_uverbs_ah_attr attr; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_create_ah_resp { __u32 ah_handle; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_destroy_ah { @@ -886,7 +886,7 @@ struct ib_uverbs_attach_mcast { __u32 qp_handle; __u16 mlid; __u16 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_detach_mcast { @@ -894,7 +894,7 @@ struct ib_uverbs_detach_mcast { __u32 qp_handle; __u16 mlid; __u16 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_flow_spec_hdr { @@ -1135,7 +1135,7 @@ struct ib_uverbs_flow_attr { * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ - struct ib_uverbs_flow_spec_hdr flow_specs[0]; + struct ib_uverbs_flow_spec_hdr flow_specs[]; }; struct ib_uverbs_create_flow { @@ -1161,7 +1161,7 @@ struct ib_uverbs_create_srq { __u32 max_wr; __u32 max_sge; __u32 srq_limit; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_create_xsrq { @@ -1175,7 +1175,7 @@ struct ib_uverbs_create_xsrq { __u32 max_num_tags; __u32 xrcd_handle; __u32 cq_handle; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_create_srq_resp { @@ -1183,7 +1183,7 @@ struct ib_uverbs_create_srq_resp { __u32 max_wr; __u32 max_sge; __u32 srqn; - __u32 driver_data[0]; + __u32 driver_data[]; }; struct ib_uverbs_modify_srq { @@ -1191,14 +1191,14 @@ struct ib_uverbs_modify_srq { __u32 attr_mask; __u32 max_wr; __u32 srq_limit; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_srq { __aligned_u64 response; __u32 srq_handle; __u32 reserved; - __aligned_u64 driver_data[0]; + __aligned_u64 driver_data[]; }; struct ib_uverbs_query_srq_resp { @@ -1269,7 +1269,7 @@ struct ib_uverbs_ex_create_rwq_ind_table { * wq_handle1 * wq_handle2 */ - __u32 wq_handles[0]; + __u32 wq_handles[]; }; struct ib_uverbs_ex_create_rwq_ind_table_resp { diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index ed5a514305c1..7cea03581f79 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -184,7 +184,7 @@ struct rdma_ucm_query_addr_resp { struct rdma_ucm_query_path_resp { __u32 num_paths; __u32 reserved; - struct ib_path_rec_data path_data[0]; + struct ib_path_rec_data path_data[]; }; struct rdma_ucm_conn_param { diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 38ab7accb7be..ab1aef17feb1 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -81,7 +81,7 @@ struct ib_uverbs_ioctl_hdr { __aligned_u64 reserved1; __u32 driver_id; __u32 reserved2; - struct ib_uverbs_attr attrs[0]; + struct ib_uverbs_attr attrs[]; }; #endif diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index c9812c5c2fc4..16782c360de3 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -264,7 +264,7 @@ struct fc_tlv_desc { * Size of descriptor excluding * desc_tag and desc_len fields. 
*/ - __u8 desc_value[0]; /* Descriptor Value */ + __u8 desc_value[]; /* Descriptor Value */ }; /* Descriptor tag and len fields are considered the mandatory header @@ -1027,7 +1027,7 @@ struct fc_fn_li_desc { * threshold to caause the LI event */ __be32 pname_count; /* number of portname_list elements */ - __be64 pname_list[0]; /* list of N_Port_Names accessible + __be64 pname_list[]; /* list of N_Port_Names accessible * through the attached port */ }; @@ -1069,7 +1069,7 @@ struct fc_fn_peer_congn_desc { * congestion event */ __be32 pname_count; /* number of portname_list elements */ - __be64 pname_list[0]; /* list of N_Port_Names accessible + __be64 pname_list[]; /* list of N_Port_Names accessible * through the attached port */ }; @@ -1104,7 +1104,7 @@ struct fc_els_fpin { * Size of ELS excluding fpin_cmd, * fpin_zero and desc_len fields. */ - struct fc_tlv_desc fpin_desc[0]; /* Descriptor list */ + struct fc_tlv_desc fpin_desc[]; /* Descriptor list */ }; /* Diagnostic Function Descriptor - FPIN Registration */ @@ -1115,7 +1115,7 @@ struct fc_df_desc_fpin_reg { * desc_tag and desc_len fields. */ __be32 count; /* Number of desc_tags elements */ - __be32 desc_tags[0]; /* Array of Descriptor Tags. + __be32 desc_tags[]; /* Array of Descriptor Tags. * Each tag indicates a function * supported by the N_Port (request) * or by the N_Port and Fabric @@ -1135,7 +1135,7 @@ struct fc_els_rdf { * Size of ELS excluding fpin_cmd, * fpin_zero and desc_len fields. */ - struct fc_tlv_desc desc[0]; /* Descriptor list */ + struct fc_tlv_desc desc[]; /* Descriptor list */ }; /* @@ -1148,7 +1148,7 @@ struct fc_els_rdf_resp { * and desc_list_len fields. */ struct fc_els_lsri_desc lsri; - struct fc_tlv_desc desc[0]; /* Supported Descriptor list */ + struct fc_tlv_desc desc[]; /* Supported Descriptor list */ }; @@ -1231,7 +1231,7 @@ struct fc_els_edc { * Size of ELS excluding edc_cmd, * edc_zero and desc_len fields. */ - struct fc_tlv_desc desc[0]; + struct fc_tlv_desc desc[]; /* Diagnostic Descriptor list */ }; @@ -1245,7 +1245,7 @@ struct fc_els_edc_resp { * and desc_list_len fields. 
*/ struct fc_els_lsri_desc lsri; - struct fc_tlv_desc desc[0]; + struct fc_tlv_desc desc[]; /* Supported Diagnostic Descriptor list */ }; diff --git a/include/uapi/scsi/scsi_bsg_fc.h b/include/uapi/scsi/scsi_bsg_fc.h index 3ae65e93235c..7f5930801f72 100644 --- a/include/uapi/scsi/scsi_bsg_fc.h +++ b/include/uapi/scsi/scsi_bsg_fc.h @@ -209,7 +209,7 @@ struct fc_bsg_host_vendor { __u64 vendor_id; /* start of vendor command area */ - __u32 vendor_cmd[0]; + __u32 vendor_cmd[]; }; /* Response: diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 2d3e5df39a59..3974a2a911cc 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -1106,7 +1106,7 @@ struct snd_ctl_elem_value { struct snd_ctl_tlv { unsigned int numid; /* control element numeric identification */ unsigned int length; /* in bytes aligned to 4 */ - unsigned int tlv[0]; /* first TLV */ + unsigned int tlv[]; /* first TLV */ }; #define SNDRV_CTL_IOCTL_PVERSION _IOR('U', 0x00, int) diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h index 39cf6eb75940..3532ac7046d7 100644 --- a/include/uapi/sound/firewire.h +++ b/include/uapi/sound/firewire.h @@ -38,11 +38,11 @@ struct snd_efw_transaction { __be32 category; __be32 command; __be32 status; - __be32 params[0]; + __be32 params[]; }; struct snd_firewire_event_efw_response { unsigned int type; - __be32 response[0]; /* some responses */ + __be32 response[]; /* some responses */ }; struct snd_firewire_event_digi00x_message { @@ -63,7 +63,7 @@ struct snd_firewire_tascam_change { struct snd_firewire_event_tascam_control { unsigned int type; - struct snd_firewire_tascam_change changes[0]; + struct snd_firewire_tascam_change changes[]; }; struct snd_firewire_event_motu_register_dsp_change { diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index a93c0decfdd5..f29899b179a6 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -151,7 +151,7 @@ struct skl_dfw_algo_data { __u32 rsvd:30; __u32 param_id; __u32 max; - char params[0]; + char params[]; } __packed; enum skl_tkn_dir { diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h index 5f4518e7a972..dbf137516522 100644 --- a/include/uapi/sound/sof/header.h +++ b/include/uapi/sound/sof/header.h @@ -23,7 +23,7 @@ struct sof_abi_hdr { __u32 size; /**< size in bytes of data excl. 
this struct */ __u32 abi; /**< SOF ABI version */ __u32 reserved[4]; /**< reserved for future use */ - __u32 data[0]; /**< Component data - opaque to core */ + __u32 data[]; /**< Component data - opaque to core */ } __packed; #endif diff --git a/include/uapi/sound/usb_stream.h b/include/uapi/sound/usb_stream.h index 95419d8bbc16..ffdd3ea1e31d 100644 --- a/include/uapi/sound/usb_stream.h +++ b/include/uapi/sound/usb_stream.h @@ -61,7 +61,7 @@ struct usb_stream { unsigned inpacket_split_at; unsigned next_inpacket_split; unsigned next_inpacket_split_at; - struct usb_stream_packet inpacket[0]; + struct usb_stream_packet inpacket[]; }; enum usb_stream_state { diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index bf6e96011dfe..e135f4dcb19d 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -198,13 +198,13 @@ struct kvm_msrs { __u32 nmsrs; /* number of msrs in entries */ __u32 pad; - struct kvm_msr_entry entries[0]; + struct kvm_msr_entry entries[]; }; /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; + __u32 indices[]; }; /* Maximum size of any access bitmap in bytes */ @@ -241,7 +241,7 @@ struct kvm_cpuid_entry { struct kvm_cpuid { __u32 nent; __u32 padding; - struct kvm_cpuid_entry entries[0]; + struct kvm_cpuid_entry entries[]; }; struct kvm_cpuid_entry2 { @@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 { struct kvm_cpuid2 { __u32 nent; __u32 padding; - struct kvm_cpuid_entry2 entries[0]; + struct kvm_cpuid_entry2 entries[]; }; /* for KVM_GET_PIT and KVM_SET_PIT */ @@ -389,7 +389,7 @@ struct kvm_xsave { * the contents of CPUID leaf 0xD on the host. */ __u32 region[1024]; - __u32 extra[0]; + __u32 extra[]; }; #define KVM_MAX_XCRS 16 @@ -515,7 +515,7 @@ struct kvm_pmu_event_filter { __u32 fixed_counter_bitmap; __u32 flags; __u32 pad[4]; - __u64 events[0]; + __u64 events[]; }; #define KVM_PMU_EVENT_ALLOW 0 diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 05c3642aaece..239b91b13c60 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -2060,7 +2060,7 @@ struct i915_context_engines_load_balance { __u64 mbz64; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \ @@ -2098,7 +2098,7 @@ struct i915_context_engines_bond { __u64 flags; /* all undefined flags must be zero */ __u64 mbz64[4]; /* reserved for future use; must be zero */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_ENGINES_BOND(name__, N__) struct { \ @@ -2225,7 +2225,7 @@ struct i915_context_engines_parallel_submit { * length = width (i) * num_siblings (j) * index = j + i * num_siblings */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __packed; diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index bdf7b404b3e7..b7b56871029c 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -90,7 +90,7 @@ struct file_dedupe_range { __u16 dest_count; /* in - total elements in info array */ __u16 reserved1; /* must be zero */ __u32 reserved2; /* must be zero */ - struct file_dedupe_range_info info[0]; + struct file_dedupe_range_info 
info[]; }; /* And dynamically-tunable limits and defaults: */ diff --git a/tools/include/uapi/linux/if_tun.h b/tools/include/uapi/linux/if_tun.h index 454ae31b93c7..2ec07de1d73b 100644 --- a/tools/include/uapi/linux/if_tun.h +++ b/tools/include/uapi/linux/if_tun.h @@ -108,7 +108,7 @@ struct tun_pi { struct tun_filter { __u16 flags; /* TUN_FLT_ flags see above */ __u16 count; /* Number of addresses */ - __u8 addr[0][ETH_ALEN]; + __u8 addr[][ETH_ALEN]; }; #endif /* _UAPI__IF_TUN_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6a184d260c7f..37ce8cbac322 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -539,7 +539,7 @@ struct kvm_coalesced_mmio { struct kvm_coalesced_mmio_ring { __u32 first, last; - struct kvm_coalesced_mmio coalesced_mmio[0]; + struct kvm_coalesced_mmio coalesced_mmio[]; }; #define KVM_COALESCED_MMIO_MAX \ @@ -618,7 +618,7 @@ struct kvm_clear_dirty_log { /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; - __u8 sigset[0]; + __u8 sigset[]; }; /* for KVM_TPR_ACCESS_REPORTING */ @@ -1216,7 +1216,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing { __u32 nr; __u32 flags; - struct kvm_irq_routing_entry entries[0]; + struct kvm_irq_routing_entry entries[]; }; #endif @@ -1335,7 +1335,7 @@ struct kvm_dirty_tlb { struct kvm_reg_list { __u64 n; /* number of regs */ - __u64 reg[0]; + __u64 reg[]; }; struct kvm_one_reg { diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index d37629dbad72..4653834f078f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -491,7 +491,7 @@ struct perf_event_query_bpf { /* * User provided buffer to store program ids */ - __u32 ids[0]; + __u32 ids[]; }; /* diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h index 12153771396a..3faee0199a9b 100644 --- a/tools/include/uapi/linux/pkt_cls.h +++ b/tools/include/uapi/linux/pkt_cls.h @@ -180,7 +180,7 @@ struct tc_u32_sel { short hoff; __be32 hmask; - struct tc_u32_key keys[0]; + struct tc_u32_key keys[]; }; struct tc_u32_mark { @@ -192,7 +192,7 @@ struct tc_u32_mark { struct tc_u32_pcnt { __u64 rcnt; __u64 rhit; - __u64 kcnts[0]; + __u64 kcnts[]; }; /* Flags */ diff --git a/tools/include/uapi/linux/seg6.h b/tools/include/uapi/linux/seg6.h index 286e8d6a8e98..f94baf154c47 100644 --- a/tools/include/uapi/linux/seg6.h +++ b/tools/include/uapi/linux/seg6.h @@ -30,7 +30,7 @@ struct ipv6_sr_hdr { __u8 flags; __u16 tag; - struct in6_addr segments[0]; + struct in6_addr segments[]; }; #define SR6_FLAG1_PROTECTED (1 << 6) @@ -49,7 +49,7 @@ struct ipv6_sr_hdr { struct sr6_tlv { __u8 type; __u8 len; - __u8 data[0]; + __u8 data[]; }; #endif diff --git a/tools/include/uapi/linux/usbdevice_fs.h b/tools/include/uapi/linux/usbdevice_fs.h index cf525cddeb94..74a84e02422a 100644 --- a/tools/include/uapi/linux/usbdevice_fs.h +++ b/tools/include/uapi/linux/usbdevice_fs.h @@ -131,7 +131,7 @@ struct usbdevfs_urb { unsigned int signr; /* signal to be sent on completion, or 0 if none should be sent. 
 */
 	void __user *usercontext;
-	struct usbdevfs_iso_packet_desc iso_frame_desc[0];
+	struct usbdevfs_iso_packet_desc iso_frame_desc[];
 };
 
 /* ioctls for talking directly to drivers */
@@ -176,7 +176,7 @@ struct usbdevfs_disconnect_claim {
 struct usbdevfs_streams {
 	unsigned int num_streams; /* Not used by USBDEVFS_FREE_STREAMS */
 	unsigned int num_eps;
-	unsigned char eps[0];
+	unsigned char eps[];
 };
 
 /*
diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h
index 2d3e5df39a59..3974a2a911cc 100644
--- a/tools/include/uapi/sound/asound.h
+++ b/tools/include/uapi/sound/asound.h
@@ -1106,7 +1106,7 @@ struct snd_ctl_elem_value {
 struct snd_ctl_tlv {
	unsigned int numid;	/* control element numeric identification */
	unsigned int length;	/* in bytes aligned to 4 */
-	unsigned int tlv[0];	/* first TLV */
+	unsigned int tlv[];	/* first TLV */
 };
 
 #define SNDRV_CTL_IOCTL_PVERSION	_IOR('U', 0x00, int)
-- cgit v1.2.3

From 06e445f740c1a0fe5d16b3dff8a4ef18e124e54e Mon Sep 17 00:00:00 2001
From: Ossama Othman
Date: Mon, 27 Jun 2022 18:02:42 -0700
Subject: mptcp: fix conflict with <netinet/in.h>

Including <netinet/in.h> before the C library <linux/mptcp.h> header
causes symbol redefinition errors at compile-time due to duplicate
declarations and definitions in the <linux/in.h> header included by
<linux/mptcp.h>.

Explicitly include <netinet/in.h> before <linux/in.h> in
<linux/mptcp.h> when __KERNEL__ is not defined so that the C library
compatibility logic in <libc-compat.h> is enabled when including
<linux/mptcp.h> in user space code.

Fixes: c11c5906bc0a ("mptcp: add MPTCP_SUBFLOW_ADDRS getsockopt support")
Signed-off-by: Ossama Othman
Signed-off-by: Mat Martineau
Signed-off-by: Jakub Kicinski
---
 include/uapi/linux/mptcp.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 921963589904..dfe19bf13f4c 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -2,16 +2,17 @@
 #ifndef _UAPI_MPTCP_H
 #define _UAPI_MPTCP_H
 
+#ifndef __KERNEL__
+#include <netinet/in.h>	/* for sockaddr_in and sockaddr_in6	*/
+#include <sys/socket.h>	/* for struct sockaddr			*/
+#endif
+
 #include <linux/const.h>
 #include <linux/types.h>
 #include <linux/in.h>		/* for sockaddr_in		*/
 #include <linux/in6.h>		/* for sockaddr_in6		*/
 #include <linux/socket.h>	/* for sockaddr_storage and sa_family */
 
-#ifndef __KERNEL__
-#include <sys/socket.h>		/* for struct sockaddr */
-#endif
-
 #define MPTCP_SUBFLOW_FLAG_MCAP_REM	_BITUL(0)
 #define MPTCP_SUBFLOW_FLAG_MCAP_LOC	_BITUL(1)
 #define MPTCP_SUBFLOW_FLAG_JOIN_REM	_BITUL(2)
-- cgit v1.2.3

From 69fd337a975c7e690dfe49d9cb4fe5ba1e6db44e Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Tue, 28 Jun 2022 10:43:06 -0700
Subject: bpf: per-cgroup lsm flavor

Allow attaching to lsm hooks in the cgroup context.

Attaching to per-cgroup LSM works exactly like attaching to other
per-cgroup hooks. New BPF_LSM_CGROUP is added to trigger the new mode;
the actual lsm hook we attach to is signaled via the existing
attach_btf_id.

For the hooks that have 'struct socket' or 'struct sock' as their first
argument, we use the cgroup associated with that socket. For the rest,
we use the cgroup of 'current' (this is all on the default hierarchy ==
v2 only). Note that for some hooks that work on 'struct sock' we still
take the cgroup from 'current' because some of them work on a socket
that hasn't been properly initialized yet.

Behind the scenes, we allocate a shim program that is attached to the
trampoline and runs the cgroup's effective BPF program array. This shim
has some rudimentary ref counting and can be shared between several
programs attaching to the same lsm hook from different cgroups.
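[As a rough, hypothetical illustration of that flow (not part of the patch): a BPF-side program written for this mode might look like the sketch below. The "lsm_cgroup/" ELF section name assumes the libbpf support added elsewhere in this series, the bpf_set_retval() call follows from the helper wiring in bpf_lsm_func_proto() further down, and socket_bind is only an example hook choice.

/* Hedged sketch of a per-cgroup LSM program (BPF_LSM_CGROUP). All
 * names below are illustrative assumptions, not taken from the patch.
 */
#include "vmlinux.h"
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("lsm_cgroup/socket_bind")
int BPF_PROG(deny_bind, struct socket *sock, struct sockaddr *address,
	     int addrlen)
{
	/* Returning 1 allows the operation; returning 0 rejects it.
	 * bpf_set_retval() picks the errno userspace sees instead of
	 * the default -EPERM.
	 */
	bpf_set_retval(-EACCES);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

User space would then attach the loaded program to a cgroup fd, for example with bpf_prog_attach(prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0) or through the link_create path added to kernel/bpf/syscall.c below.]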
Note that this patch bloats cgroup size because we add 211
cgroup_bpf_attach_type(s) for simplicity's sake. This will be addressed
in the subsequent patch.

Also note that we only add the non-sleepable flavor for now. To enable
sleepable use-cases, bpf_prog_run_array_cg has to grab trace rcu, shim
programs have to be freed via trace rcu, cgroup_bpf.effective should
also be trace-rcu-managed + maybe some other changes that I'm not aware
of.

Reviewed-by: Martin KaFai Lau
Signed-off-by: Stanislav Fomichev
Link: https://lore.kernel.org/r/20220628174314.1216643-4-sdf@google.com
Signed-off-by: Alexei Starovoitov
---
 arch/x86/net/bpf_jit_comp.c     |  24 +++--
 include/linux/bpf-cgroup-defs.h |   8 ++
 include/linux/bpf-cgroup.h      |   7 ++
 include/linux/bpf.h             |  24 +++++
 include/linux/bpf_lsm.h         |  13 +++
 include/linux/btf_ids.h         |   3 +-
 include/uapi/linux/bpf.h        |   1 +
 kernel/bpf/bpf_lsm.c            |  48 ++++++++++
 kernel/bpf/btf.c                |  11 +++
 kernel/bpf/cgroup.c             | 136 ++++++++++++++++++++++++---
 kernel/bpf/core.c               |   2 +
 kernel/bpf/syscall.c            |  10 ++
 kernel/bpf/trampoline.c         | 198 ++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c           |  32 +++++++
 tools/include/uapi/linux/bpf.h  |   1 +
 15 files changed, 498 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2c51ca9f7cec..2f460c67f9c7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1770,6 +1770,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
			   struct bpf_tramp_link *l, int stack_size,
			   int run_ctx_off, bool save_ret)
 {
+	void (*exit)(struct bpf_prog *prog, u64 start,
+		     struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit;
+	u64 (*enter)(struct bpf_prog *prog,
+		     struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter;
	u8 *prog = *pprog;
	u8 *jmp_insn;
	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
@@ -1788,15 +1792,21 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
	 */
	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1,
		 -run_ctx_off + ctx_cookie_off);
 
+	if (p->aux->sleepable) {
+		enter = __bpf_prog_enter_sleepable;
+		exit = __bpf_prog_exit_sleepable;
+	} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
+		enter = __bpf_prog_enter_lsm_cgroup;
+		exit = __bpf_prog_exit_lsm_cgroup;
+	}
+
	/* arg1: mov rdi, progs[i] */
	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
	/* arg2: lea rsi, [rbp - ctx_cookie_off] */
	EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
-	if (emit_call(&prog,
-		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
-		      __bpf_prog_enter, prog))
-		return -EINVAL;
+	if (emit_call(&prog, enter, prog))
+		return -EINVAL;
	/* remember prog start time returned by __bpf_prog_enter */
	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
@@ -1840,10 +1850,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
	/* arg3: lea rdx, [rbp - run_ctx_off] */
	EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
-	if (emit_call(&prog,
-		      p->aux->sleepable ?
__bpf_prog_exit_sleepable : - __bpf_prog_exit, prog)) - return -EINVAL; + if (emit_call(&prog, exit, prog)) + return -EINVAL; *pprog = prog; return 0; diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 5d268e76d8e6..b99f8c3e37ea 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -10,6 +10,12 @@ struct bpf_prog_array; +#ifdef CONFIG_BPF_LSM +#define CGROUP_LSM_NUM 211 /* will be addressed in the next patch */ +#else +#define CGROUP_LSM_NUM 0 +#endif + enum cgroup_bpf_attach_type { CGROUP_BPF_ATTACH_TYPE_INVALID = -1, CGROUP_INET_INGRESS = 0, @@ -35,6 +41,8 @@ enum cgroup_bpf_attach_type { CGROUP_INET4_GETSOCKNAME, CGROUP_INET6_GETSOCKNAME, CGROUP_INET_SOCK_RELEASE, + CGROUP_LSM_START, + CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1, MAX_CGROUP_BPF_ATTACH_TYPE }; diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6673acfbf2ef..2bd1b5f8de9b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,6 +23,13 @@ struct ctl_table; struct ctl_table_header; struct task_struct; +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn); +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn); +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn); + #ifdef CONFIG_CGROUP_BPF #define CGROUP_ATYPE(type) \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d547be9db75f..77cd613a00bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -794,6 +794,10 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); @@ -1060,6 +1064,7 @@ struct bpf_prog_aux { struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; + int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY @@ -1167,6 +1172,11 @@ struct bpf_tramp_link { u64 cookie; }; +struct bpf_shim_tramp_link { + struct bpf_tramp_link link; + struct bpf_trampoline *trampoline; +}; + struct bpf_tracing_link { struct bpf_tramp_link link; enum bpf_attach_type attach_type; @@ -1245,6 +1255,9 @@ struct bpf_dummy_ops { int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype); +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { @@ -1268,6 +1281,14 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + return -EOPNOTSUPP; +} +static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ +} #endif struct bpf_array { @@ -2368,6 +2389,8 @@ extern const 
struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
+extern const struct bpf_func_proto bpf_set_retval_proto;
+extern const struct bpf_func_proto bpf_get_retval_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -2485,6 +2508,7 @@ int bpf_arch_text_invalidate(void *dst, size_t len);
 
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
+int btf_id_set_index(const struct btf_id_set *set, u32 id);
 
 #define MAX_BPRINTF_VARARGS	12
 
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 479c101546ad..61787a5f6af9 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -42,6 +42,9 @@ extern const struct bpf_func_proto bpf_inode_storage_get_proto;
 extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
 void bpf_inode_storage_free(struct inode *inode);
 
+int bpf_lsm_hook_idx(u32 btf_id);
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
+
 #else /* !CONFIG_BPF_LSM */
 
 static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
@@ -65,6 +68,16 @@ static inline void bpf_inode_storage_free(struct inode *inode)
 {
 }
 
+static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+					    bpf_func_t *bpf_func)
+{
+}
+
+static inline int bpf_lsm_hook_idx(u32 btf_id)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 335a19092368..252a4befeab1 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -179,7 +179,8 @@ extern struct btf_id_set name;
	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)	\
	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)	\
	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock)	\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock)
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock)	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCKET, socket)
 
 enum {
 #define BTF_SOCK_TYPE(name, str) name,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e81362891596..b7479898c879 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -998,6 +998,7 @@ enum bpf_attach_type {
	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
	BPF_PERF_EVENT,
	BPF_TRACE_KPROBE_MULTI,
+	BPF_LSM_CGROUP,
	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index c1351df9f7ee..0f72020bfdcf 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -16,6 +16,7 @@
 #include <linux/bpf_local_storage.h>
 #include <linux/btf_ids.h>
 #include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
 
 /* For every LSM hook that allows attachment of BPF programs, declare a nop
  * function where a BPF program can be attached.
  */
@@ -35,6 +36,44 @@ BTF_SET_START(bpf_lsm_hooks)
 #undef LSM_HOOK
 BTF_SET_END(bpf_lsm_hooks)
 
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+BTF_SET_END(bpf_lsm_current_hooks)
+
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+			      bpf_func_t *bpf_func)
+{
+	const struct btf_param *args;
+
+	if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+	    btf_id_set_contains(&bpf_lsm_current_hooks,
+				prog->aux->attach_btf_id)) {
+		*bpf_func = __cgroup_bpf_run_lsm_current;
+		return;
+	}
+
+	args = btf_params(prog->aux->attach_func_proto);
+
+#ifdef CONFIG_NET
+	if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+		*bpf_func = __cgroup_bpf_run_lsm_socket;
+	else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+		*bpf_func = __cgroup_bpf_run_lsm_sock;
+	else
+#endif
+		*bpf_func = __cgroup_bpf_run_lsm_current;
+}
+
+int bpf_lsm_hook_idx(u32 btf_id)
+{
+	return btf_id_set_index(&bpf_lsm_hooks, btf_id);
+}
+
 int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
			const struct bpf_prog *prog)
 {
@@ -158,6 +197,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
	case BPF_FUNC_get_attach_cookie:
		return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+	case BPF_FUNC_get_local_storage:
+		return prog->expected_attach_type == BPF_LSM_CGROUP ?
+			&bpf_get_local_storage_proto : NULL;
+	case BPF_FUNC_set_retval:
+		return prog->expected_attach_type == BPF_LSM_CGROUP ?
+			&bpf_set_retval_proto : NULL;
+	case BPF_FUNC_get_retval:
+		return prog->expected_attach_type == BPF_LSM_CGROUP ?
+			&bpf_get_retval_proto : NULL;
	default:
		return tracing_prog_func_proto(func_id, prog);
	}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2e2066d6af94..7c1fe422ed3f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5363,6 +5363,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
	if (arg == nr_args) {
		switch (prog->expected_attach_type) {
+		case BPF_LSM_CGROUP:
		case BPF_LSM_MAC:
		case BPF_TRACE_FEXIT:
			/* When LSM programs are attached to void LSM hooks
@@ -6842,6 +6843,16 @@ static int btf_id_cmp_func(const void *a, const void *b)
	return *pa - *pb;
 }
 
+int btf_id_set_index(const struct btf_id_set *set, u32 id)
+{
+	const u32 *p;
+
+	p = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func);
+	if (!p)
+		return -1;
+	return p - set->ids;
+}
+
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
 {
	return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4adb4f3ecb7f..9cf41dd4f96f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,6 +14,8 @@
 #include <linux/string.h>
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
 
@@ -61,6 +63,87 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
	return run_ctx.retval;
 }
 
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+				       const struct bpf_insn *insn)
+{
+	const struct bpf_prog *shim_prog;
+	struct sock *sk;
+	struct cgroup *cgrp;
+	int ret = 0;
+	u64 *args;
+
+	args = (u64 *)ctx;
+	sk = (void *)(unsigned long)args[0];
+	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	if (likely(cgrp))
+		ret = bpf_prog_run_array_cg(&cgrp->bpf,
+					    shim_prog->aux->cgroup_atype,
+					    ctx, bpf_prog_run, 0, NULL);
+	return ret;
+}
+
+unsigned int
__cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -163,10 +246,16 @@ static void cgroup_bpf_release(struct work_struct *work) hlist_for_each_entry_safe(pl, pltmp, progs, node) { hlist_del(&pl->node); - if (pl->prog) + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -479,6 +568,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? 
: link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; @@ -495,7 +585,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -549,17 +639,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -651,7 +754,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -803,9 +906,15 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; @@ -839,8 +948,11 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; } @@ -999,6 +1111,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1343,7 +1457,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1358,7 +1472,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f023cb399e3f..4cc10b942a3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2666,6 +2666,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; 
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7d5af5b99f0d..626b8f7d237b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3416,6 +3416,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
		return BPF_PROG_TYPE_SK_LOOKUP;
	case BPF_XDP:
		return BPF_PROG_TYPE_XDP;
+	case BPF_LSM_CGROUP:
+		return BPF_PROG_TYPE_LSM;
	default:
		return BPF_PROG_TYPE_UNSPEC;
	}
@@ -3469,6 +3471,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
	case BPF_PROG_TYPE_CGROUP_SYSCTL:
	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_LSM:
+		if (ptype == BPF_PROG_TYPE_LSM &&
+		    prog->expected_attach_type != BPF_LSM_CGROUP)
+			return -EINVAL;
+
		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
		break;
	default:
@@ -3506,6 +3513,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
	case BPF_PROG_TYPE_CGROUP_SYSCTL:
	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_LSM:
		return cgroup_bpf_prog_detach(attr, ptype);
	default:
		return -EINVAL;
@@ -4540,6 +4548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
		ret = bpf_raw_tp_link_attach(prog, NULL);
	else if (prog->expected_attach_type == BPF_TRACE_ITER)
		ret = bpf_iter_link_attach(attr, uattr, prog);
+	else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+		ret = cgroup_bpf_link_attach(attr, prog);
	else
		ret = bpf_tracing_prog_attach(prog,
					      attr->link_create.target_fd,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5466e15be61f..d7c251d7fbcd 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -11,6 +11,8 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/module.h>
 #include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
 
 /* dummy _ops. The verifier will operate on target program's ops.
*/ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -496,6 +498,177 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin return err; } +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. 
*/ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + + return err; +} + +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -628,6 +801,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + migrate_enable(); + rcu_read_unlock(); +} + u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4938477912cd..df3ec6b05f05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7322,6 +7322,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES; } + break; + case BPF_FUNC_set_retval: + if (env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; } if (err) @@ -10527,6 +10539,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. 
@@ -10543,6 +10571,9 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -14902,6 +14933,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e81362891596..b7479898c879 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -998,6 +998,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From b79c9fc9551b45953a94abf550b7bd3b00e3a0f9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jun 2022 10:43:08 -0700 Subject: bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP We have two options: 1. Treat all BPF_LSM_CGROUP the same, regardless of attach_btf_id 2. Treat BPF_LSM_CGROUP+attach_btf_id as a separate hook point I was doing (2) in the original patch, but switching to (1) here: * bpf_prog_query returns all attached BPF_LSM_CGROUP programs regardless of attach_btf_id * attach_btf_id is exported via bpf_prog_info Reviewed-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220628174314.1216643-6-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++ kernel/bpf/cgroup.c | 95 ++++++++++++++++++++++++++++++++---------------- kernel/bpf/syscall.c | 8 +++- 3 files changed, 74 insertions(+), 32 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b7479898c879..ad9e7311c4cf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1432,6 +1432,7 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; + __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ @@ -6076,6 +6077,8 @@ struct bpf_prog_info { __u64 run_cnt; __u64 recursion_misses; __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 169cbd0de797..59b7eb60d5b4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1017,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; + enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct hlist_head *progs; - struct bpf_prog *prog; int cnt, ret = 0, i; + int total_cnt = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); - if (atype < 0) - return -EINVAL; - - progs = &cgrp->bpf.progs[atype]; - flags = cgrp->bpf.flags[atype]; + if (type == BPF_LSM_CGROUP) { + if (attr->query.prog_cnt 
&& prog_ids && !prog_attach_flags) + return -EINVAL; - effective = rcu_dereference_protected(cgrp->bpf.effective[atype], - lockdep_is_held(&cgroup_mutex)); + from_atype = CGROUP_LSM_START; + to_atype = CGROUP_LSM_END; + flags = 0; + } else { + from_atype = to_cgroup_bpf_attach_type(type); + if (from_atype < 0) + return -EINVAL; + to_atype = from_atype; + flags = cgrp->bpf.flags[from_atype]; + } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(effective); - else - cnt = prog_list_length(progs); + for (atype = from_atype; atype <= to_atype; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + total_cnt += bpf_prog_array_length(effective); + } else { + total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + } + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; + + if (attr->query.prog_cnt < total_cnt) { + total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; + for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); + ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); + } else { + struct hlist_head *progs; + struct bpf_prog_list *pl; + struct bpf_prog *prog; + u32 id; + + progs = &cgrp->bpf.progs[atype]; + cnt = min_t(int, prog_list_length(progs), total_cnt); + i = 0; + hlist_for_each_entry(pl, progs, node) { + prog = prog_list_prog(pl); + id = prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } - i = 0; - hlist_for_each_entry(pl, progs, node) { - prog = prog_list_prog(pl); - id = prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; + + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; } + + prog_ids += cnt; + total_cnt -= cnt; } return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 626b8f7d237b..ab688d85b2c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3520,7 +3520,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) } } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3556,6 +3556,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: + case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, 
uattr); @@ -4066,6 +4067,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); + info.attach_btf_id = prog->aux->attach_btf_id; + if (prog->aux->attach_btf) + info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf); + else if (prog->aux->dst_prog) + info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; -- cgit v1.2.3 From 211da42eaa45db7b0edfde187dd88a85fbd466b5 Mon Sep 17 00:00:00 2001 From: Yuwei Wang Date: Wed, 29 Jun 2022 08:48:32 +0000 Subject: net, neigh: introduce interval_probe_time_ms for periodic probe Commit ed6cd6a17896 ("net, neigh: Set lower cap for neigh_managed_work rearming") fixed a case where configuring DELAY_PROBE_TIME to 0 caused the system work queue processing to hog the CPU at 100%; beyond that fix, the periodic probe should get a dedicated option of its own. Signed-off-by: Yuwei Wang Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 6 ++++++ include/net/neighbour.h | 1 + include/uapi/linux/neighbour.h | 1 + include/uapi/linux/sysctl.h | 37 +++++++++++++++++----------------- net/core/neighbour.c | 32 +++++++++++++++++++++++++++-- net/decnet/dn_neigh.c | 1 + net/ipv4/arp.c | 1 + net/ipv6/ndisc.c | 1 + 8 files changed, 60 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9f41961d11d5..4c8bbf5acfd1 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -202,6 +202,12 @@ neigh/default/unres_qlen - INTEGER Default: 101 +neigh/default/interval_probe_time_ms - INTEGER + The probe interval for neighbor entries with NTF_MANAGED flag, + the min value is 1. + + Default: 5000 + mtu_expires - INTEGER Time, in seconds, that cached PMTU information is kept.
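As a quick usage sketch (not part of the patch; the path assumes the IPv4 defaults table), userspace could tune the new knob through the sysctl file documented above:

/*
 * Sketch: set the managed-neighbor probe interval to 2000 ms via the
 * new sysctl. Values below the minimum of 1 are rejected; the default
 * is 5000 ms.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/neigh/default/interval_probe_time_ms", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "2000\n");
	return fclose(f) ? 1 : 0;
}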
diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 87419f7f5421..9f0bab0589d9 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -48,6 +48,7 @@ enum { NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, + NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 39c565e460c7..a998bf761635 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -154,6 +154,7 @@ enum { NDTPA_QUEUE_LENBYTES, /* u32 */ NDTPA_MCAST_REPROBES, /* u32 */ NDTPA_PAD, + NDTPA_INTERVAL_PROBE_TIME_MS, /* u64, msecs */ __NDTPA_MAX }; #define NDTPA_MAX (__NDTPA_MAX - 1) diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 6a3b194c50fe..8981f00204db 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -584,24 +584,25 @@ enum { /* /proc/sys/net//neigh/ */ enum { - NET_NEIGH_MCAST_SOLICIT=1, - NET_NEIGH_UCAST_SOLICIT=2, - NET_NEIGH_APP_SOLICIT=3, - NET_NEIGH_RETRANS_TIME=4, - NET_NEIGH_REACHABLE_TIME=5, - NET_NEIGH_DELAY_PROBE_TIME=6, - NET_NEIGH_GC_STALE_TIME=7, - NET_NEIGH_UNRES_QLEN=8, - NET_NEIGH_PROXY_QLEN=9, - NET_NEIGH_ANYCAST_DELAY=10, - NET_NEIGH_PROXY_DELAY=11, - NET_NEIGH_LOCKTIME=12, - NET_NEIGH_GC_INTERVAL=13, - NET_NEIGH_GC_THRESH1=14, - NET_NEIGH_GC_THRESH2=15, - NET_NEIGH_GC_THRESH3=16, - NET_NEIGH_RETRANS_TIME_MS=17, - NET_NEIGH_REACHABLE_TIME_MS=18, + NET_NEIGH_MCAST_SOLICIT = 1, + NET_NEIGH_UCAST_SOLICIT = 2, + NET_NEIGH_APP_SOLICIT = 3, + NET_NEIGH_RETRANS_TIME = 4, + NET_NEIGH_REACHABLE_TIME = 5, + NET_NEIGH_DELAY_PROBE_TIME = 6, + NET_NEIGH_GC_STALE_TIME = 7, + NET_NEIGH_UNRES_QLEN = 8, + NET_NEIGH_PROXY_QLEN = 9, + NET_NEIGH_ANYCAST_DELAY = 10, + NET_NEIGH_PROXY_DELAY = 11, + NET_NEIGH_LOCKTIME = 12, + NET_NEIGH_GC_INTERVAL = 13, + NET_NEIGH_GC_THRESH1 = 14, + NET_NEIGH_GC_THRESH2 = 15, + NET_NEIGH_GC_THRESH3 = 16, + NET_NEIGH_RETRANS_TIME_MS = 17, + NET_NEIGH_REACHABLE_TIME_MS = 18, + NET_NEIGH_INTERVAL_PROBE_TIME_MS = 19, }; /* /proc/sys/net/dccp */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d8ec70622ecb..6a8c2596ebab 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1579,7 +1579,7 @@ static void neigh_managed_work(struct work_struct *work) list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, - max(NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME), HZ)); + NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); write_unlock_bh(&tbl->lock); } @@ -2100,7 +2100,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, - NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD)) + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, + NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -2255,6 +2257,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, + [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2373,6 +2376,10 @@ static int 
neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; + case NDTPA_INTERVAL_PROBE_TIME_MS: + NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, + nla_get_msecs(tbp[i])); + break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); @@ -3562,6 +3569,22 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } +static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + int min = msecs_to_jiffies(1); + + tmp.extra1 = &min; + tmp.extra2 = NULL; + + ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3658,6 +3681,9 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) +#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) + #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) @@ -3676,6 +3702,8 @@ static struct neigh_sysctl_table { NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, + "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index fbd98ac853ea..7c569bcc0aca 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -94,6 +94,7 @@ struct neigh_table dn_neigh_table = { [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 0, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ab4a5601c82a..af2f12ffc9ca 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -168,6 +168,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b0dfe97ea4ee..cd84cbdac0a2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -128,6 +128,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, -- cgit v1.2.3 From 29c1ac230e6056b26846c66881802b581a78ad72 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jun 
2022 13:25:57 +0100 Subject: io_uring: keep sendrecv flags in ioprio We waste a u64 SQE field for flags even though we don't need as many bits and it can be used for something more useful later. Store io_uring specific send/recv flags in sqe->ioprio instead of ->addr2. Signed-off-by: Pavel Begunkov Fixes: 0455d4ccec54 ("io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg") [axboe: change comment in io_uring.h as well] Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- include/uapi/linux/io_uring.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5ff2cdb425bc..aeb042ba5cc5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1183,6 +1183,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_async_setup = 1, + .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { @@ -1191,6 +1192,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .needs_async_setup = 1, + .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { @@ -1266,6 +1268,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .audit_skip = 1, + .ioprio = 1, }, [IORING_OP_RECV] = { .needs_file = 1, @@ -1273,6 +1276,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .audit_skip = 1, + .ioprio = 1, }, [IORING_OP_OPENAT2] = { }, @@ -6075,12 +6079,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->file_index)) + if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); + sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; @@ -6311,12 +6315,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(sqe->file_index)) + if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); + sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 53e7dae92e42..f10b59d6693e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -244,7 +244,7 @@ enum io_uring_op { #define IORING_ASYNC_CANCEL_ANY (1U << 2) /* - * send/sendmsg and recv/recvmsg flags (sqe->addr2) + * send/sendmsg and recv/recvmsg flags (sqe->ioprio) * * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send * or receive and arm poll if that yields an -- cgit v1.2.3 From d7cfea332cca483ccd3970ed4de73ed892643e51 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Thu, 20 Jan 2022 15:06:30 -0500 Subject: drm/amdkfd: Add KFD SMI event IDs and triggers Define new system management interface event IDs for migration, GPU recoverable page fault, user queues eviction, restore and unmap from GPU events and corresponding event triggers, those will be implemented in the following patches. 
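To make the intended consumption concrete, here is a small sketch (an illustration only, not from this series) that builds an event mask from the new IDs using the KFD_SMI_EVENT_MASK_FROM_INDEX() macro shown in the hunk below; how the mask reaches the driver is outside this patch:

#include <linux/kfd_ioctl.h>

/* Select the new migration start/end SMI events in one mask. */
static unsigned long long kfd_migrate_events_mask(void)
{
	return KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_START) |
	       KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_MIGRATE_END);
}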
Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 231eb010b823..280edda46faf 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -470,6 +470,43 @@ enum kfd_smi_event { KFD_SMI_EVENT_THERMAL_THROTTLE = 2, KFD_SMI_EVENT_GPU_PRE_RESET = 3, KFD_SMI_EVENT_GPU_POST_RESET = 4, + KFD_SMI_EVENT_MIGRATE_START = 5, + KFD_SMI_EVENT_MIGRATE_END = 6, + KFD_SMI_EVENT_PAGE_FAULT_START = 7, + KFD_SMI_EVENT_PAGE_FAULT_END = 8, + KFD_SMI_EVENT_QUEUE_EVICTION = 9, + KFD_SMI_EVENT_QUEUE_RESTORE = 10, + KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, + + /* + * max event number, as a flag bit to get events from all processes, + * this requires super user permission, otherwise will not be able to + * receive event from any process. Without this flag to receive events + * from same process. + */ + KFD_SMI_EVENT_ALL_PROCESS = 64 +}; + +enum KFD_MIGRATE_TRIGGERS { + KFD_MIGRATE_TRIGGER_PREFETCH, + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, + KFD_MIGRATE_TRIGGER_TTM_EVICTION +}; + +enum KFD_QUEUE_EVICTION_TRIGGERS { + KFD_QUEUE_EVICTION_TRIGGER_SVM, + KFD_QUEUE_EVICTION_TRIGGER_USERPTR, + KFD_QUEUE_EVICTION_TRIGGER_TTM, + KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, + KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, + KFD_QUEUE_EVICTION_CRIU_RESTORE +}; + +enum KFD_SVM_UNMAP_TRIGGERS { + KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, + KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE, + KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) -- cgit v1.2.3 From 3db7f894549a54dd6f8dfb7fe213ede83249bb94 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Wed, 19 Jan 2022 12:57:26 -0500 Subject: drm/amdkfd: Bump KFD API version for SMI profiling event Indicate SMI profiling events available. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 280edda46faf..d993a0d50994 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -35,9 +35,10 @@ * - 1.7 - Checkpoint Restore (CRIU) API * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs * - 1.9 - Add available memory ioctl + * - 1.10 - Add SMI profiler event log */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 9 +#define KFD_IOCTL_MINOR_VERSION 10 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From ecad3b0b99bff7247a11f8c7cb19ac9b0cb28b09 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 23 May 2022 18:55:58 +0530 Subject: wifi: cfg80211: Increase akm_suites array size in cfg80211_crypto_settings Increase akm_suites array size in struct cfg80211_crypto_settings to 10 and advertise the capability to userspace. This allows userspace to send more than two AKMs to driver in netlink commands such as NL80211_CMD_CONNECT. This capability is needed for implementing WPA3-Personal transition mode correctly with any driver that handles roaming internally. Currently, the possible AKMs for multi-AKM connect can include PSK, PSK-SHA-256, SAE, FT-PSK and FT-SAE. 
Since the count is already 5, increasing the akm_suites array size to 10 should be reasonable for future usecases. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1653312358-12321-1-git-send-email-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/quantenna/qtnfmac/commands.c | 12 ++++++++---- include/net/cfg80211.h | 11 ++++++++++- include/uapi/linux/nl80211.h | 14 ++++++++++++++ net/wireless/core.c | 6 ++++++ net/wireless/nl80211.c | 7 ++++++- 5 files changed, 44 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c index 3d734a7a5ba8..0fad53693292 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/commands.c +++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c @@ -241,6 +241,7 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif, struct qlink_auth_encr *aen; int ret; int i; + int n; if (!qtnf_cmd_start_ap_can_fit(vif, s)) return -E2BIG; @@ -280,8 +281,9 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif, for (i = 0; i < QLINK_MAX_NR_CIPHER_SUITES; i++) aen->ciphers_pairwise[i] = cpu_to_le32(s->crypto.ciphers_pairwise[i]); - aen->n_akm_suites = cpu_to_le32(s->crypto.n_akm_suites); - for (i = 0; i < QLINK_MAX_NR_AKM_SUITES; i++) + n = min(QLINK_MAX_NR_AKM_SUITES, s->crypto.n_akm_suites); + aen->n_akm_suites = cpu_to_le32(n); + for (i = 0; i < n; i++) aen->akm_suites[i] = cpu_to_le32(s->crypto.akm_suites[i]); aen->control_port = s->crypto.control_port; aen->control_port_no_encrypt = s->crypto.control_port_no_encrypt; @@ -2076,6 +2078,7 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif, struct qlink_auth_encr *aen; int ret; int i; + int n; u32 connect_flags = 0; cmd_skb = qtnf_cmd_alloc_new_cmdskb(vif->mac->macid, vif->vifid, @@ -2132,9 +2135,10 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif, aen->ciphers_pairwise[i] = cpu_to_le32(sme->crypto.ciphers_pairwise[i]); - aen->n_akm_suites = cpu_to_le32(sme->crypto.n_akm_suites); + n = min(QLINK_MAX_NR_AKM_SUITES, sme->crypto.n_akm_suites); + aen->n_akm_suites = cpu_to_le32(n); - for (i = 0; i < QLINK_MAX_NR_AKM_SUITES; i++) + for (i = 0; i < n; i++) aen->akm_suites[i] = cpu_to_le32(sme->crypto.akm_suites[i]); aen->control_port = sme->crypto.control_port; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 87ebed6a48bd..6bc161d653f3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1063,6 +1063,7 @@ struct survey_info { }; #define CFG80211_MAX_WEP_KEYS 4 +#define CFG80211_MAX_NUM_AKM_SUITES 10 /** * struct cfg80211_crypto_settings - Crypto settings @@ -1114,7 +1115,7 @@ struct cfg80211_crypto_settings { int n_ciphers_pairwise; u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES]; int n_akm_suites; - u32 akm_suites[NL80211_MAX_NR_AKM_SUITES]; + u32 akm_suites[CFG80211_MAX_NUM_AKM_SUITES]; bool control_port; __be16 control_port_ethertype; bool control_port_no_encrypt; @@ -5200,6 +5201,13 @@ struct wiphy_iftype_akm_suites { * @ema_max_profile_periodicity: maximum profile periodicity supported by * the driver. Setting this field to a non-zero value indicates that the * driver supports enhanced multi-BSSID advertisements (EMA AP). + * @max_num_akm_suites: maximum number of AKM suites allowed for + * configuration through %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_START_AP. Set to NL80211_MAX_NR_AKM_SUITES if not set by + * driver. 
If set by driver minimum allowed value is + * NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with + * legacy userspace and maximum allowed value is + * CFG80211_MAX_NUM_AKM_SUITES. */ struct wiphy { struct mutex mtx; @@ -5346,6 +5354,7 @@ struct wiphy { u8 mbssid_max_interfaces; u8 ema_max_profile_periodicity; + u16 max_num_akm_suites; char priv[] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 89f64f46b98d..279f9715919e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2694,6 +2694,13 @@ enum nl80211_commands { * connection. Used with %NL80211_CMD_CONNECT. If this attribute is not * included in NL80211_CMD_CONNECT drivers must not perform MLO connection. * + * @NL80211_ATTR_MAX_NUM_AKM_SUITES: U16 attribute. Indicates maximum number of + * AKM suites allowed for %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and + * %NL80211_CMD_START_AP in %NL80211_CMD_GET_WIPHY response. If this + * attribute is not present userspace shall consider maximum number of AKM + * suites allowed as %NL80211_MAX_NR_AKM_SUITES which is the legacy maximum + * number prior to the introduction of this attribute. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3214,6 +3221,8 @@ enum nl80211_attrs { NL80211_ATTR_MLO_SUPPORT, + NL80211_ATTR_MAX_NUM_AKM_SUITES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3268,6 +3277,11 @@ enum nl80211_attrs { #define NL80211_HE_MIN_CAPABILITY_LEN 16 #define NL80211_HE_MAX_CAPABILITY_LEN 54 #define NL80211_MAX_NR_CIPHER_SUITES 5 + +/* + * NL80211_MAX_NR_AKM_SUITES is obsolete when %NL80211_ATTR_MAX_NUM_AKM_SUITES + * present in %NL80211_CMD_GET_WIPHY response. 
+ */ #define NL80211_MAX_NR_AKM_SUITES 2 #define NL80211_EHT_MIN_CAPABILITY_LEN 13 #define NL80211_EHT_MAX_CAPABILITY_LEN 51 diff --git a/net/wireless/core.c b/net/wireless/core.c index 3e5d12040726..6b5321bb1176 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -913,6 +913,12 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; #endif + if (!wiphy->max_num_akm_suites) + wiphy->max_num_akm_suites = NL80211_MAX_NR_AKM_SUITES; + else if (wiphy->max_num_akm_suites < NL80211_MAX_NR_AKM_SUITES || + wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES) + return -EINVAL; + /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b583a76ef492..e2b6740268a6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -798,6 +798,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS), [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT }, }; /* policy for the key attributes */ @@ -2932,6 +2933,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_mbssid_support(&rdev->wiphy, msg)) goto nla_put_failure; + if (nla_put_u16(msg, NL80211_ATTR_MAX_NUM_AKM_SUITES, + rdev->wiphy.max_num_akm_suites)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -10431,7 +10436,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (len % sizeof(u32)) return -EINVAL; - if (settings->n_akm_suites > NL80211_MAX_NR_AKM_SUITES) + if (settings->n_akm_suites > rdev->wiphy.max_num_akm_suites) return -EINVAL; memcpy(settings->akm_suites, data, len); -- cgit v1.2.3 From e252f2ed1c8c6c3884ab5dd34e003ed21f1fe6e0 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 29 Jun 2022 17:42:10 +0300 Subject: fanotify: introduce FAN_MARK_IGNORE This flag is a new way to configure ignore mask which allows adding and removing the event flags FAN_ONDIR and FAN_EVENT_ON_CHILD in ignore mask. The legacy FAN_MARK_IGNORED_MASK flag would always ignore events on directories and would ignore events on children depending on whether the FAN_EVENT_ON_CHILD flag was set in the (non ignored) mask. FAN_MARK_IGNORE can be used to ignore events on children without setting FAN_EVENT_ON_CHILD in the mark's mask and will not ignore events on directories unconditionally, only when FAN_ONDIR is set in ignore mask. The new behavior is non-downgradable. After calling fanotify_mark() with FAN_MARK_IGNORE once, calling fanotify_mark() with FAN_MARK_IGNORED_MASK on the same object will return EEXIST error. Setting the event flags with FAN_MARK_IGNORE on a non-dir inode mark has no meaning and will return ENOTDIR error. The meaning of FAN_MARK_IGNORED_SURV_MODIFY is preserved with the new FAN_MARK_IGNORE flag, but with a few semantic differences: 1. FAN_MARK_IGNORED_SURV_MODIFY is required for filesystem and mount marks and on an inode mark on a directory. Omitting this flag will return EINVAL or EISDIR error. 2. An ignore mask on a non-directory inode that survives modify could never be downgraded to an ignore mask that does not survive modify. With new FAN_MARK_IGNORE semantics we make that rule explicit - trying to update a surviving ignore mask without the flag FAN_MARK_IGNORED_SURV_MODIFY will return EEXIST error. 
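For illustration, a sketch of a new-style call (hypothetical path and event choice, not from the patch): ignore open events on a directory's children without setting FAN_EVENT_ON_CHILD in the regular mask. Note the explicit FAN_MARK_IGNORED_SURV_MODIFY, which directory marks require:

#include <fcntl.h>
#include <sys/fanotify.h>

/* Ignore FAN_OPEN for children of /tmp/watched (hypothetical path). */
static int ignore_open_on_children(int fanotify_fd)
{
	return fanotify_mark(fanotify_fd,
			     FAN_MARK_ADD | FAN_MARK_IGNORE |
			     FAN_MARK_IGNORED_SURV_MODIFY,
			     FAN_OPEN | FAN_EVENT_ON_CHILD,
			     AT_FDCWD, "/tmp/watched");
}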
The convenience macro FAN_MARK_IGNORE_SURV is added for (FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY), because the common case should use short constant names. Link: https://lore.kernel.org/r/20220629144210.2983229-4-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 2 ++ fs/notify/fanotify/fanotify_user.c | 63 ++++++++++++++++++++++++++++++++------ include/linux/fanotify.h | 5 ++- include/uapi/linux/fanotify.h | 8 +++++ 4 files changed, 67 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80e0ec95b113..1d9f11255c64 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + mflags |= FAN_MARK_IGNORE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6781d46cd37c..f0e49a406ffa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1009,7 +1009,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { fsn_mark->ignore_mask &= ~mask; @@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; + /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; + /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set * because of an ignore mask that is now going to survive FS_MODIFY. */ - if ((fan_flags & FAN_MARK_IGNORED_MASK) && - (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) @@ -1120,7 +1129,7 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, bool recalc; spin_lock(&fsn_mark->lock); - if (!(fan_flags & FAN_MARK_IGNORED_MASK)) + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else fsn_mark->ignore_mask |= mask; @@ -1197,6 +1206,24 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) return -EEXIST; + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. 
With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + return 0; } @@ -1231,7 +1258,8 @@ static int fanotify_add_mark(struct fsnotify_group *group, * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ - if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; @@ -1275,7 +1303,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. */ - if ((flags & FAN_MARK_IGNORED_MASK) && + if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; @@ -1531,7 +1559,8 @@ static int fanotify_events_supported(struct fsnotify_group *group, unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || - (mask & FAN_RENAME); + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); /* * Some filesystems such as 'proc' acquire unusual locks when opening @@ -1569,7 +1598,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; - bool ignore = flags & FAN_MARK_IGNORED_MASK; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; @@ -1618,12 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + /* * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with * FAN_MARK_IGNORED_MASK. */ - if (ignore) + if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } f = fdget(fanotify_fd); if (unlikely(!f.file)) @@ -1727,6 +1763,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + ret = mnt ? 
-EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode)) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; @@ -1819,7 +1862,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index a7207f092fd1..8ad743def6f3 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -62,11 +62,14 @@ #define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \ FAN_MARK_FLUSH) +#define FANOTIFY_MARK_IGNORE_BITS (FAN_MARK_IGNORED_MASK | \ + FAN_MARK_IGNORE) + #define FANOTIFY_MARK_FLAGS (FANOTIFY_MARK_TYPE_BITS | \ FANOTIFY_MARK_CMD_BITS | \ + FANOTIFY_MARK_IGNORE_BITS | \ FAN_MARK_DONT_FOLLOW | \ FAN_MARK_ONLYDIR | \ - FAN_MARK_IGNORED_MASK | \ FAN_MARK_IGNORED_SURV_MODIFY | \ FAN_MARK_EVICTABLE) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index f1f89132d60e..d8536d77fea1 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -83,12 +83,20 @@ #define FAN_MARK_FLUSH 0x00000080 /* FAN_MARK_FILESYSTEM is 0x00000100 */ #define FAN_MARK_EVICTABLE 0x00000200 +/* This bit is mutually exclusive with FAN_MARK_IGNORED_MASK bit */ +#define FAN_MARK_IGNORE 0x00000400 /* These are NOT bitwise flags. Both bits can be used togther. */ #define FAN_MARK_INODE 0x00000000 #define FAN_MARK_MOUNT 0x00000010 #define FAN_MARK_FILESYSTEM 0x00000100 +/* + * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY + * for non-inode mark types. + */ +#define FAN_MARK_IGNORE_SURV (FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY) + /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_MARK_FLAGS (FAN_MARK_ADD |\ FAN_MARK_REMOVE |\ -- cgit v1.2.3 From 5e25c25aa2c08fb9a79476e029c0b1e3dcd70566 Mon Sep 17 00:00:00 2001 From: Petr Vaněk Date: Thu, 30 Jun 2022 16:27:20 +0200 Subject: xfrm: improve wording of comment above XFRM_OFFLOAD flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have noticed a few minor wording issues in a comment recently added above XFRM_OFFLOAD flags in 7c76ecd9c99b ("xfrm: enforce validity of offload input flags"). Signed-off-by: Petr Vaněk Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 3ed61df9cc91..7929bf9cbee4 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,9 +511,9 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; -/* This flag was exposed without any kernel code that supporting it. - * Unfortunately, strongswan has the code that uses sets this flag, - * which makes impossible to reuse this bit. +/* This flag was exposed without any kernel code that supports it. + * Unfortunately, strongswan has the code that sets this flag, + * which makes it impossible to reuse this bit. 
* * So leave it here to make sure that it won't be reused by mistake. */ -- cgit v1.2.3 From 3c660a5d86f4c01cf641bfea004a49f5860a5bed Mon Sep 17 00:00:00 2001 From: Daniel Müller Date: Tue, 28 Jun 2022 16:01:18 +0000 Subject: bpf: Introduce TYPE_MATCH related constants/macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to provide type match support we require a new type of relocation which, in turn, requires toolchain support. Recent LLVM/Clang versions support a new value for the last argument to the __builtin_preserve_type_info builtin, for example. With this change we introduce the necessary constants into relevant header files, mirroring what the compiler may support. Signed-off-by: Daniel Müller Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220628160127.607834-2-deso@posteo.net --- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/bpf_core_read.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ad9e7311c4cf..379e68fb866f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind { BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ad9e7311c4cf..379e68fb866f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6786,6 +6786,7 @@ enum bpf_core_relo_kind { BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index fd48b1ff59ca..2308f4990e96 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -29,6 +29,7 @@ enum bpf_type_id_kind { enum bpf_type_info_kind { BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ }; /* second argument to __builtin_preserve_enum_value() built-in */ -- cgit v1.2.3 From 88527790c079fb1ea41cbcfa4450ee37906a2fb0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Jul 2022 16:59:24 -0700 Subject: tls: rx: add sockopt for enabling optimistic decrypt with TLS 1.3 Since optimistic decrypt may add extra load in case of retries, require the socket owner to explicitly opt in. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/tls.rst | 18 ++++++++++ include/linux/sockptr.h | 8 +++++ include/net/tls.h | 3 ++ include/uapi/linux/snmp.h | 1 + include/uapi/linux/tls.h | 2 ++ net/tls/tls_main.c | 75 ++++++++++++++++++++++++++++++++++++++++ net/tls/tls_proc.c | 1 + net/tls/tls_sw.c | 21 +++++++---- 8 files changed, 122 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index be8e10c14b05..7a6643836e42 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -239,6 +239,19 @@ for the original TCP transmission and TCP retransmissions. 
To the receiver this will look like TLS records had been tampered with and will result in record authentication failures. +TLS_RX_EXPECT_NO_PAD +~~~~~~~~~~~~~~~~~~~~ + +TLS 1.3 only. Expect the sender to not pad records. This allows the data +to be decrypted directly into user space buffers with TLS 1.3. + +This optimization is safe to enable only if the remote end is trusted, +otherwise it is an attack vector that can double the TLS processing cost. + +If the decrypted record turns out to have been padded or is not a data +record, it will be decrypted again into a kernel buffer without zero copy. +Such events are counted in the ``TlsDecryptRetry`` statistic. + Statistics ========== @@ -264,3 +277,8 @@ TLS implementation exposes the following per-namespace statistics - ``TlsDeviceRxResync`` - number of RX resyncs sent to NICs handling cryptography + +- ``TlsDecryptRetry`` - + number of RX records which had to be re-decrypted due to + ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. Note that this counter will + also increment for non-data records. diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index ea193414298b..d45902fb4cad 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -102,4 +102,12 @@ static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count) return strncpy_from_user(dst, src.user, count); } +static inline int check_zeroed_sockptr(sockptr_t src, size_t offset, + size_t size) +{ + if (!sockptr_is_kernel(src)) + return check_zeroed_user(src.user + offset, size); + return memchr_inv(src.kernel + offset, 0, size) == NULL; +} + #endif /* _LINUX_SOCKPTR_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 8017f1703447..4fc16ca5f469 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -149,6 +149,7 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 async_capable:1; + u8 zc_capable:1; atomic_t decrypt_pending; /* protect crypto_wait with decrypt_pending*/ spinlock_t decrypt_compl_lock; @@ -239,6 +240,7 @@ struct tls_context { u8 tx_conf:3; u8 rx_conf:3; u8 zerocopy_sendfile:1; + u8 rx_no_pad:1; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); @@ -358,6 +360,7 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval, void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); +void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 904909d020e2..1c9152add663 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -344,6 +344,7 @@ enum LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ + LINUX_MIN_TLSDECRYPTRETRY, /* TlsDecryptRetry */ __LINUX_MIB_TLSMAX }; diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index bb8f80812b0b..f1157d8f4acd 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -40,6 +40,7 @@ #define TLS_TX 1 /* Set transmit parameters */ #define TLS_RX 2 /* Set receive parameters */ #define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ +#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 
0xFF) @@ -162,6 +163,7 @@ enum { TLS_INFO_TXCONF, TLS_INFO_RXCONF, TLS_INFO_ZC_RO_TX, + TLS_INFO_RX_NO_PAD, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 2ffede463e4a..1b3efc96db0b 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -533,6 +533,37 @@ static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, return 0; } +static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + unsigned int value; + int err, len; + + if (ctx->prot_info.version != TLS_1_3_VERSION) + return -EINVAL; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(value)) + return -EINVAL; + + lock_sock(sk); + err = -EINVAL; + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) + value = ctx->rx_no_pad; + release_sock(sk); + if (err) + return err; + + if (put_user(sizeof(value), optlen)) + return -EFAULT; + if (copy_to_user(optval, &value, sizeof(value))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -547,6 +578,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_TX_ZEROCOPY_RO: rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); break; + case TLS_RX_EXPECT_NO_PAD: + rc = do_tls_getsockopt_no_pad(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -718,6 +752,38 @@ static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, return 0; } +static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + u32 val; + int rc; + + if (ctx->prot_info.version != TLS_1_3_VERSION || + sockptr_is_null(optval) || optlen < sizeof(val)) + return -EINVAL; + + rc = copy_from_sockptr(&val, optval, sizeof(val)); + if (rc) + return -EFAULT; + if (val > 1) + return -EINVAL; + rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val)); + if (rc < 1) + return rc == 0 ? 
-EINVAL : rc; + + lock_sock(sk); + rc = -EINVAL; + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) { + ctx->rx_no_pad = val; + tls_update_rx_zc_capable(ctx); + rc = 0; + } + release_sock(sk); + + return rc; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -736,6 +802,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); release_sock(sk); break; + case TLS_RX_EXPECT_NO_PAD: + rc = do_tls_setsockopt_no_pad(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -976,6 +1045,11 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb) if (err) goto nla_failure; } + if (ctx->rx_no_pad) { + err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD); + if (err) + goto nla_failure; + } rcu_read_unlock(); nla_nest_end(skb, start); @@ -997,6 +1071,7 @@ static size_t tls_get_info_size(const struct sock *sk) nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ + nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ 0; return size; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index feeceb0e4cb4..0c200000cc45 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -18,6 +18,7 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), + SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2bac57684429..7592b6519953 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1601,6 +1601,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && darg->tail != TLS_RECORD_TYPE_DATA)) { darg->zc = false; + TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY); return decrypt_skb_update(sk, skb, dest, darg); } @@ -1787,7 +1788,7 @@ int tls_sw_recvmsg(struct sock *sk, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek && - prot->version != TLS_1_3_VERSION; + ctx->zc_capable; decrypted = 0; while (len && (decrypted + copied < target || ctx->recv_pkt)) { struct tls_decrypt_arg darg = {}; @@ -2269,6 +2270,14 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx) strp_check_rcv(&rx_ctx->strp); } +void tls_update_rx_zc_capable(struct tls_context *tls_ctx) +{ + struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx); + + rx_ctx->zc_capable = tls_ctx->rx_no_pad || + tls_ctx->prot_info.version != TLS_1_3_VERSION; +} + int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -2504,12 +2513,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (sw_ctx_rx) { tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); - if (crypto_info->version == TLS_1_3_VERSION) - sw_ctx_rx->async_capable = 0; - else - sw_ctx_rx->async_capable = - !!(tfm->__crt_alg->cra_flags & - CRYPTO_ALG_ASYNC); + tls_update_rx_zc_capable(ctx); + sw_ctx_rx->async_capable = + crypto_info->version != TLS_1_3_VERSION && + !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); /* Set up strparser */ memset(&cb, 0, sizeof(cb)); -- cgit v1.2.3 From fce7bf30d3309ea325a7ec452456f8f7c079f622 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: 
Fri, 24 Jun 2022 11:03:37 -0400 Subject: drm/amdkfd: add new flag for svm It is to add new option for always keeping gpu mapping. Signed-off-by: Eric Huang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index d993a0d50994..d6bda40fbcf7 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -613,6 +613,8 @@ enum kfd_mmio_remap { #define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010 /* GPUs mostly read, may allow similar optimizations as RO, but writes fault */ #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 +/* Keep GPU memory mapping always valid as if XNACK is disable */ +#define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 /** * kfd_ioctl_svm_op - SVM ioctl operations -- cgit v1.2.3 From bdb2c48e4b38e6dbe82533b437468999ba3ae498 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:00:38 +0100 Subject: io_uring: explicit sqe padding for ioctl commands 32 bit sqe->cmd_op is an union with 64 bit values. It's always a good idea to do padding explicitly. Also zero check it in prep, so it can be used in the future if needed without compatibility concerns. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e6b95a05e970af79000435166185e85b196b2ba2.1657202417.git.asml.silence@gmail.com [axboe: turn bitwise OR into logical variant] Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- include/uapi/linux/io_uring.h | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d491ad15b66..cddc0e8490af 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5066,7 +5066,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req, { struct io_uring_cmd *ioucmd = &req->uring_cmd; - if (sqe->rw_flags) + if (sqe->rw_flags || sqe->__pad1) return -EINVAL; ioucmd->cmd = sqe->cmd; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f10b59d6693e..0ad3da28d2fc 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -22,7 +22,10 @@ struct io_uring_sqe { union { __u64 off; /* offset into file */ __u64 addr2; - __u32 cmd_op; + struct { + __u32 cmd_op; + __u32 __pad1; + }; }; union { __u64 addr; /* pointer to buffer or iovecs */ -- cgit v1.2.3 From f8d3da4ef8faf027261e06b7864583930dd7c7b9 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 6 Jul 2022 16:25:47 -0700 Subject: bpf: Add flags arg to bpf_dynptr_read and bpf_dynptr_write APIs Commit 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") added the bpf_dynptr_write() and bpf_dynptr_read() APIs. However, it will be needed for some dynptr types to pass in flags as well (e.g. when writing to a skb, the user may like to invalidate the hash or recompute the checksum). This patch adds a "u64 flags" arg to the bpf_dynptr_read() and bpf_dynptr_write() APIs before their UAPI signature freezes where we then cannot change them anymore with a 5.19.x released kernel. 
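A minimal sketch of the updated signatures as seen from a BPF program (mirroring the selftest changes below; the trailing flags argument must currently be 0):

#include <bpf/bpf_helpers.h>

/* Round-trip bytes through an already-initialized dynptr. */
static __always_inline int dynptr_roundtrip(struct bpf_dynptr *ptr)
{
	char data[16] = {};
	int err;

	err = bpf_dynptr_write(ptr, 0, data, sizeof(data), 0 /* flags */);
	if (err)
		return err;
	return bpf_dynptr_read(data, sizeof(data), ptr, 0, 0 /* flags */);
}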
Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220706232547.4016651-1-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 11 +++++++---- kernel/bpf/helpers.c | 12 ++++++++---- tools/include/uapi/linux/bpf.h | 11 +++++++---- tools/testing/selftests/bpf/progs/dynptr_fail.c | 10 +++++----- tools/testing/selftests/bpf/progs/dynptr_success.c | 4 ++-- 5 files changed, 29 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. 
* * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 225806a02efb..bb1254f07667 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) { int err; - if (!src->data) + if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); @@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) { int err; - if (!dst->data || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); @@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. 
 * * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index d811cff73597..0a26c243e6e9 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -140,12 +140,12 @@ int use_after_invalid(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(read_data), 0, &ptr); - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); return 0; } @@ -338,7 +338,7 @@ int invalid_helper2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0); return 0; } @@ -377,7 +377,7 @@ int invalid_write2(void *ctx) memcpy((void *)&ptr + 8, &x, sizeof(x)); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); @@ -473,7 +473,7 @@ int invalid_read2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index d67be48df4b2..a3a6103c8569 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -43,10 +43,10 @@ int test_read_write(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); /* Write data into the dynptr */ - err = err ?: bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data)); + err = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); /* Read the data that was written into the dynptr */ - err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); /* Ensure the data we read matches the data we wrote */ for (i = 0; i < sizeof(read_data); i++) { -- cgit v1.2.3 From 3f805f8cc23ba35679dd01446929292911c2b469 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 27 Jun 2022 08:35:25 -0700 Subject: LoadPin: Enable loading from trusted dm-verity devices Extend LoadPin to allow loading of kernel files from trusted dm-verity [1] devices. This change adds the concept of trusted verity devices to LoadPin. LoadPin maintains a list of root digests of verity devices it considers trusted. Userspace can populate this list through an ioctl on the new LoadPin securityfs entry 'dm-verity'. The ioctl receives a file descriptor of a file with verity digests as parameter. LoadPin reads the digests from this file after confirming that the file is located on the pinned root. The digest file must contain one digest per line. The list of trusted digests can only be set up once, which is typically done at boot time. When a kernel file is read, LoadPin first checks (as usual) whether the file is located on the pinned root; if so, the file can be loaded.
Otherwise, if the verity extension is enabled, LoadPin determines whether the file is located on a verity backed device and whether the root digest of that device is in the list of trusted digests. The file can be loaded if the verity device has a trusted root digest. Background: As of now LoadPin restricts loading of kernel files to a single pinned filesystem, typically the rootfs. This works for many systems, however it can result in a bloated rootfs (and OTA updates) on platforms where multiple boards with different hardware configurations use the same rootfs image. Especially when 'optional' files are large it may be preferable to download/install them only when they are actually needed by a given board. Chrome OS uses Downloadable Content (DLC) [2] to deploy certain 'packages' at runtime. As an example a DLC package could contain firmware for a peripheral that is not present on all boards. DLCs use dm-verity to verify the integrity of the DLC content. [1] https://www.kernel.org/doc/html/latest/admin-guide/device-mapper/verity.html [2] https://chromium.googlesource.com/chromiumos/platform2/+/HEAD/dlcservice/docs/developer.md Signed-off-by: Matthias Kaehlcke Acked-by: Mike Snitzer Link: https://lore.kernel.org/lkml/20220627083512.v7.2.I01c67af41d2f6525c6d023101671d7339a9bc8b5@changeid Signed-off-by: Kees Cook --- include/uapi/linux/loadpin.h | 22 ++++++ security/loadpin/Kconfig | 16 +++++ security/loadpin/loadpin.c | 167 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/loadpin.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/loadpin.h b/include/uapi/linux/loadpin.h new file mode 100644 index 000000000000..daa6dbb8bb02 --- /dev/null +++ b/include/uapi/linux/loadpin.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2022, Google LLC + */ + +#ifndef _UAPI_LINUX_LOOP_LOADPIN_H +#define _UAPI_LINUX_LOOP_LOADPIN_H + +#define LOADPIN_IOC_MAGIC 'L' + +/** + * LOADPIN_IOC_SET_TRUSTED_VERITY_DIGESTS - Set up the root digests of verity devices + * that loadpin should trust. + * + * Takes a file descriptor from which to read the root digests of trusted verity devices. The file + * is expected to contain a list of digests in ASCII format, with one line per digest. The ioctl + * must be issued on the securityfs attribute 'loadpin/dm-verity' (which can be typically found + * under /sys/kernel/security/loadpin/dm-verity). + */ +#define LOADPIN_IOC_SET_TRUSTED_VERITY_DIGESTS _IOW(LOADPIN_IOC_MAGIC, 0x00, unsigned int) + +#endif /* _UAPI_LINUX_LOOP_LOADPIN_H */ diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig index 91be65dec2ab..70e7985b2561 100644 --- a/security/loadpin/Kconfig +++ b/security/loadpin/Kconfig @@ -18,3 +18,19 @@ config SECURITY_LOADPIN_ENFORCE If selected, LoadPin will enforce pinning at boot. If not selected, it can be enabled at boot with the kernel parameter "loadpin.enforce=1". + +config SECURITY_LOADPIN_VERITY + bool "Allow reading files from certain other filesystems that use dm-verity" + depends on SECURITY_LOADPIN && DM_VERITY=y && SECURITYFS + help + If selected LoadPin can allow reading files from filesystems + that use dm-verity. LoadPin maintains a list of verity root + digests it considers trusted. A verity backed filesystem is + considered trusted if its root digest is found in the list + of trusted digests. 
+ + The list of trusted verity can be populated through an ioctl + on the LoadPin securityfs entry 'dm-verity'. The ioctl + expects a file descriptor of a file with verity digests as + parameter. The file must be located on the pinned root and + contain a comma separated list of digests. diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index ad4e6756c038..6ab5f2bbf41f 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -18,6 +18,8 @@ #include #include /* current */ #include +#include +#include static void report_load(const char *origin, struct file *file, char *operation) { @@ -43,6 +45,9 @@ static char *exclude_read_files[READING_MAX_ID]; static int ignore_read_file_id[READING_MAX_ID] __ro_after_init; static struct super_block *pinned_root; static DEFINE_SPINLOCK(pinned_root_spinlock); +#ifdef CONFIG_SECURITY_LOADPIN_VERITY +static bool deny_reading_verity_digests; +#endif #ifdef CONFIG_SYSCTL @@ -171,7 +176,8 @@ static int loadpin_read_file(struct file *file, enum kernel_read_file_id id, spin_unlock(&pinned_root_spinlock); } - if (IS_ERR_OR_NULL(pinned_root) || load_root != pinned_root) { + if (IS_ERR_OR_NULL(pinned_root) || + ((load_root != pinned_root) && !dm_verity_loadpin_is_bdev_trusted(load_root->s_bdev))) { if (unlikely(!enforce)) { report_load(origin, file, "pinning-ignored"); return 0; @@ -237,6 +243,7 @@ static int __init loadpin_init(void) enforce ? "" : "not "); parse_exclude(); security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin"); + return 0; } @@ -245,6 +252,164 @@ DEFINE_LSM(loadpin) = { .init = loadpin_init, }; +#ifdef CONFIG_SECURITY_LOADPIN_VERITY + +enum loadpin_securityfs_interface_index { + LOADPIN_DM_VERITY, +}; + +static int read_trusted_verity_root_digests(unsigned int fd) +{ + struct fd f; + void *data; + int rc; + char *p, *d; + + if (deny_reading_verity_digests) + return -EPERM; + + /* The list of trusted root digests can only be set up once */ + if (!list_empty(&dm_verity_loadpin_trusted_root_digests)) + return -EPERM; + + f = fdget(fd); + if (!f.file) + return -EINVAL; + + data = kzalloc(SZ_4K, GFP_KERNEL); + if (!data) { + rc = -ENOMEM; + goto err; + } + + rc = kernel_read_file(f.file, 0, (void **)&data, SZ_4K - 1, NULL, READING_POLICY); + if (rc < 0) + goto err; + + p = data; + p[rc] = '\0'; + p = strim(p); + + p = strim(data); + while ((d = strsep(&p, "\n")) != NULL) { + int len = strlen(d); + struct dm_verity_loadpin_trusted_root_digest *trd; + + if (len % 2) { + rc = -EPROTO; + goto err; + } + + len /= 2; + + trd = kzalloc(struct_size(trd, data, len), GFP_KERNEL); + if (!trd) { + rc = -ENOMEM; + goto err; + } + + if (hex2bin(trd->data, d, len)) { + kfree(trd); + rc = -EPROTO; + goto err; + } + + trd->len = len; + + list_add_tail(&trd->node, &dm_verity_loadpin_trusted_root_digests); + } + + if (list_empty(&dm_verity_loadpin_trusted_root_digests)) { + rc = -EPROTO; + goto err; + } + + kfree(data); + fdput(f); + + return 0; + +err: + kfree(data); + + /* any failure in loading/parsing invalidates the entire list */ + { + struct dm_verity_loadpin_trusted_root_digest *trd, *tmp; + + list_for_each_entry_safe(trd, tmp, &dm_verity_loadpin_trusted_root_digests, node) { + list_del(&trd->node); + kfree(trd); + } + } + + /* disallow further attempts after reading a corrupt/invalid file */ + deny_reading_verity_digests = true; + + fdput(f); + + return rc; +} + +/******************************** securityfs ********************************/ + +static long dm_verity_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg) +{ + void __user *uarg = (void __user *)arg; + unsigned int fd; + int rc; + + switch (cmd) { + case LOADPIN_IOC_SET_TRUSTED_VERITY_DIGESTS: + rc = copy_from_user(&fd, uarg, sizeof(fd)); + if (rc) + return rc; + + return read_trusted_verity_root_digests(fd); + + default: + return -EINVAL; + } +} + +static const struct file_operations loadpin_dm_verity_ops = { + .unlocked_ioctl = dm_verity_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +/** + * init_loadpin_securityfs - create the securityfs directory for LoadPin + * + * We can not put this method normally under the loadpin_init() code path since + * the security subsystem gets initialized before the vfs caches. + * + * Returns 0 if the securityfs directory creation was successful. + */ +static int __init init_loadpin_securityfs(void) +{ + struct dentry *loadpin_dir, *dentry; + + loadpin_dir = securityfs_create_dir("loadpin", NULL); + if (IS_ERR(loadpin_dir)) { + pr_err("LoadPin: could not create securityfs dir: %ld\n", + PTR_ERR(loadpin_dir)); + return PTR_ERR(loadpin_dir); + } + + dentry = securityfs_create_file("dm-verity", 0600, loadpin_dir, + (void *)LOADPIN_DM_VERITY, &loadpin_dm_verity_ops); + if (IS_ERR(dentry)) { + pr_err("LoadPin: could not create securityfs entry 'dm-verity': %ld\n", + PTR_ERR(dentry)); + return PTR_ERR(dentry); + } + + return 0; +} + +fs_initcall(init_loadpin_securityfs); + +#endif /* CONFIG_SECURITY_LOADPIN_VERITY */ + /* Should not be mutable after boot, so not listed in sysfs (perm == 0). */ module_param(enforce, int, 0); MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning"); -- cgit v1.2.3 From 2a96271fb66c499e4a89d76a89d3d01170c10bef Mon Sep 17 00:00:00 2001 From: Siarhei Vishniakou Date: Fri, 8 Jul 2022 21:59:23 -0700 Subject: Input: document the units for resolution of size axes Today, the resolution of size axes is not documented. As a result, it's not clear what the canonical interpretation of this value should be. On Android, there is a need to calculate the size of the touch ellipse in physical units (millimeters). After reviewing linux source, it turned out that most of the existing usages are already interpreting this value as "units/mm". This documentation will make it explicit. This will help device implementations with correctly following the linux specs, and will ensure that the devices will work on Android without needing further customized parameters for scaling of major/minor values. Signed-off-by: Siarhei Vishniakou Reviewed-by: Jeff LaBundy Link: https://lore.kernel.org/r/20220520084514.3451193-1-svv@google.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index ee3127461ee0..328cf545c029 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -78,10 +78,13 @@ struct input_id { * Note that input core does not clamp reported values to the * [minimum, maximum] limits, such task is left to userspace. * - * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z) - * is reported in units per millimeter (units/mm), resolution - * for rotational axes (ABS_RX, ABS_RY, ABS_RZ) is reported - * in units per radian. + * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z, + * ABS_MT_POSITION_X, ABS_MT_POSITION_Y) is reported in units + * per millimeter (units/mm), resolution for rotational axes + * (ABS_RX, ABS_RY, ABS_RZ) is reported in units per radian. 
+ * The resolution for the size axes (ABS_MT_TOUCH_MAJOR, + * ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MAJOR, ABS_MT_WIDTH_MINOR) + * is reported in units per millimeter (units/mm). * When INPUT_PROP_ACCELEROMETER is set the resolution changes. * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in * units per g (units/g) and in units per degree per second -- cgit v1.2.3 From faf3bfcb895037ae2a8b89d1048090c9e1291cae Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 6 Jun 2022 16:33:22 -0400 Subject: vfio-pci/zdev: add function handle to clp base capability The function handle is a system-wide unique identifier for a zPCI device. With zPCI instruction interpretation, the host will no longer be executing the zPCI instructions on behalf of the guest. As a result, the guest needs to use the real function handle in order for firmware to associate the instruction with the proper PCI function. Let's provide that handle to the guest. Reviewed-by: Christian Borntraeger Reviewed-by: Pierre Morel Signed-off-by: Matthew Rosato Acked-by: Alex Williamson Link: https://lore.kernel.org/r/20220606203325.110625-19-mjrosato@linux.ibm.com Signed-off-by: Christian Borntraeger --- drivers/vfio/pci/vfio_pci_zdev.c | 5 +++-- include/uapi/linux/vfio_zdev.h | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 686f2e75e392..4f28cdd7ecd1 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -24,14 +24,15 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_base cap = { .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE, - .header.version = 1, + .header.version = 2, .start_dma = zdev->start_dma, .end_dma = zdev->end_dma, .pchid = zdev->pchid, .vfn = zdev->vfn, .fmb_length = zdev->fmb_length, .pft = zdev->pft, - .gid = zdev->pfgid + .gid = zdev->pfgid, + .fh = zdev->fh }; return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h index b4309397b6b2..78c022af3d29 100644 --- a/include/uapi/linux/vfio_zdev.h +++ b/include/uapi/linux/vfio_zdev.h @@ -29,6 +29,9 @@ struct vfio_device_info_cap_zpci_base { __u16 fmb_length; /* Measurement Block Length (in bytes) */ __u8 pft; /* PCI Function Type */ __u8 gid; /* PCI function group ID */ + /* End of version 1 */ + __u32 fh; /* PCI function handle */ + /* End of version 2 */ }; /** -- cgit v1.2.3 From ba6090ff8ae01b41288be87ed9f6bed3d8cf5961 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 6 Jun 2022 16:33:23 -0400 Subject: vfio-pci/zdev: different maxstbl for interpreted devices When doing load/store interpretation, the maximum store block length is determined by the underlying firmware, not the host kernel API. Reflect that in the associated Query PCI Function Group clp capability and let userspace decide which is appropriate to present to the guest. 
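As a concrete illustration of "let userspace decide", here is a minimal sketch of how a VMM might pick the value, assuming it has already located the VFIO_DEVICE_INFO_CAP_ZPCI_GROUP capability in the VFIO_DEVICE_GET_INFO capability chain (struct and field names come from vfio_zdev.h as extended by the diff below; the chain walk itself is elided):

#include <stdbool.h>
#include <linux/vfio.h>
#include <linux/vfio_zdev.h>

static __u16 guest_maxstbl(const struct vfio_device_info_cap_zpci_group *cap,
			   bool interp_in_use)
{
	/* a version 1 host predates imaxstbl; only the kernel-API
	 * limit (maxstbl) is available */
	if (cap->header.version < 2 || !interp_in_use)
		return cap->maxstbl;

	/* with load/store interpretation the firmware limit applies */
	return cap->imaxstbl;
}

The versioned-capability pattern (the "End of version N" markers in the header) is what keeps old userspace working: it only ever reads the fields its compiled-in version knows about.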
Reviewed-by: Pierre Morel Signed-off-by: Matthew Rosato Acked-by: Alex Williamson Link: https://lore.kernel.org/r/20220606203325.110625-20-mjrosato@linux.ibm.com Signed-off-by: Christian Borntraeger --- drivers/vfio/pci/vfio_pci_zdev.c | 6 ++++-- include/uapi/linux/vfio_zdev.h | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index 4f28cdd7ecd1..e163aa9f6144 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -45,14 +45,16 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_group cap = { .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP, - .header.version = 1, + .header.version = 2, .dasm = zdev->dma_mask, .msi_addr = zdev->msi_addr, .flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH, .mui = zdev->fmb_update, .noi = zdev->max_msi, .maxstbl = ZPCI_MAX_WRITE_SIZE, - .version = zdev->version + .version = zdev->version, + .reserved = 0, + .imaxstbl = zdev->maxstbl }; return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h index 78c022af3d29..77f2aff1f27e 100644 --- a/include/uapi/linux/vfio_zdev.h +++ b/include/uapi/linux/vfio_zdev.h @@ -50,6 +50,10 @@ struct vfio_device_info_cap_zpci_group { __u16 noi; /* Maximum number of MSIs */ __u16 maxstbl; /* Maximum Store Block Length */ __u8 version; /* Supported PCI Version */ + /* End of version 1 */ + __u8 reserved; + __u16 imaxstbl; /* Maximum Interpreted Store Block Length */ + /* End of version 2 */ }; /** -- cgit v1.2.3 From db1c875e0539518e3d5fe9876ef50975cf4476bb Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 6 Jun 2022 16:33:24 -0400 Subject: KVM: s390: add KVM_S390_ZPCI_OP to manage guest zPCI devices The KVM_S390_ZPCI_OP ioctl provides a mechanism for managing hardware-assisted virtualization features for s390x zPCI passthrough. Add the first 2 operations, which can be used to enable/disable the specified device for Adapter Event Notification interpretation. Signed-off-by: Matthew Rosato Acked-by: Pierre Morel Reviewed-by: Thomas Huth Link: https://lore.kernel.org/r/20220606203325.110625-21-mjrosato@linux.ibm.com Signed-off-by: Christian Borntraeger --- Documentation/virt/kvm/api.rst | 47 +++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 16 ++++++++ arch/s390/kvm/pci.c | 85 ++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/pci.h | 2 + include/uapi/linux/kvm.h | 31 +++++++++++++++ 5 files changed, 181 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 11e00a46c610..d58354e9af8f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5802,6 +5802,53 @@ of CPUID leaf 0xD on the host. This ioctl injects an event channel interrupt directly to the guest vCPU. +4.137 KVM_S390_ZPCI_OP +-------------------- + +:Capability: KVM_CAP_S390_ZPCI_OP +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_zpci_op (in) +:Returns: 0 on success, <0 on error + +Used to manage hardware-assisted virtualization features for zPCI devices. 
+ +Parameters are specified via the following structure:: + + struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; + }; + +The type of operation is specified in the "op" field. +KVM_S390_ZPCIOP_REG_AEN is used to register the VM for adapter event +notification interpretation, which will allow firmware delivery of adapter +events directly to the vm, with KVM providing a backup delivery mechanism; +KVM_S390_ZPCIOP_DEREG_AEN is used to subsequently disable interpretation of +adapter event notifications. + +The target zPCI function must also be specified via the "fh" field. For the +KVM_S390_ZPCIOP_REG_AEN operation, additional information to establish firmware +delivery must be provided via the "reg_aen" struct. + +The "pad" and "reserved" fields may be used for future extensions and should be +set to 0s by userspace. + 5. The kvm_run structure ======================== diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4758bb731199..f214e0fc62ed 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -618,6 +618,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; default: r = 0; } @@ -2629,6 +2632,19 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 24211741deb0..4946fb7757d6 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -585,6 +585,91 @@ void kvm_s390_pci_clear_list(struct kvm *kvm) spin_unlock(&kvm->arch.kzdev_list_lock); } +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + 
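/* Hold both the device lock and the VM lock while validating that
+	 * this zdev is bound to the calling VM, so the kzdev<->kvm
+	 * association cannot change while the operation runs.
+	 */
+	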
mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + int kvm_s390_pci_init(void) { aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h index fb2b91b76e0c..0351382e990f 100644 --- a/arch/s390/kvm/pci.h +++ b/arch/s390/kvm/pci.h @@ -59,6 +59,8 @@ void kvm_s390_pci_aen_exit(void); void kvm_s390_pci_init_list(struct kvm *kvm); void kvm_s390_pci_clear_list(struct kvm *kvm); +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + int kvm_s390_pci_init(void); void kvm_s390_pci_exit(void); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..2f302e2287d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1157,6 +1157,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_TSC_CONTROL 214 #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 +#define KVM_CAP_S390_ZPCI_OP 221 #ifdef KVM_CAP_IRQ_ROUTING @@ -2118,4 +2119,34 @@ struct kvm_stats_desc { /* Available with KVM_CAP_XSAVE2 */ #define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) +/* Available with KVM_CAP_S390_ZPCI_OP */ +#define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 3c512307de4097aaaab3f4741c7a98fe88afa469 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Fri, 8 Jul 2022 17:56:07 +0530 Subject: wifi: nl80211: fix sending link ID info of associated BSS commit dd374f84baec ("wifi: nl80211: expose link ID for associated BSSes") used a top-level attribute to send link ID of the associated BSS in the nested attribute NL80211_ATTR_BSS. But since NL80211_ATTR_BSS is a nested attribute of the attributes defined in enum nl80211_bss, define a new attribute in enum nl80211_bss and use it for sending the link ID of the BSS. 
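For consumers of the dump, a hedged libnl-style sketch of reading the relocated attribute (nla_parse_nested() and nla_get_u8() are standard libnl calls; the surrounding scan-dump message handling is elided):

#include <netlink/attr.h>
#include <linux/nl80211.h>

/* tb[] holds the parsed top-level attributes of a scan-dump message;
 * the link ID of an associated BSS now lives inside the nested
 * NL80211_ATTR_BSS rather than at the top level. */
static int bss_mlo_link_id(struct nlattr *tb[])
{
	struct nlattr *bss[NL80211_BSS_MAX + 1];

	if (!tb[NL80211_ATTR_BSS] ||
	    nla_parse_nested(bss, NL80211_BSS_MAX, tb[NL80211_ATTR_BSS], NULL))
		return -1;
	if (!bss[NL80211_BSS_MLO_LINK_ID])
		return -1;	/* BSS is not an MLO link */
	return nla_get_u8(bss[NL80211_BSS_MLO_LINK_ID]);
}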
Fixes: dd374f84baec ("wifi: nl80211: expose link ID for associated BSSes") Signed-off-by: Veerendranath Jakkam Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20220708122607.1836958-1-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 279f9715919e..7bb1ae59f3a5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4904,6 +4904,7 @@ enum nl80211_bss_scan_width { * Contains a nested array of signal strength attributes (u8, dBm), * using the nesting index as the antenna number. * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz + * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8). * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4929,6 +4930,7 @@ enum nl80211_bss { NL80211_BSS_PARENT_BSSID, NL80211_BSS_CHAIN_SIGNAL, NL80211_BSS_FREQUENCY_OFFSET, + NL80211_BSS_MLO_LINK_ID, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e20d0fc9678a..22c4cf6fbb57 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9991,7 +9991,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, (nla_put_u32(msg, NL80211_BSS_STATUS, NL80211_BSS_STATUS_ASSOCIATED) || (wdev->valid_links && - nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)))) + nla_put_u8(msg, NL80211_BSS_MLO_LINK_ID, + link_id)))) goto nla_put_failure; } break; -- cgit v1.2.3 From 1090c1ea2208702a2fe0e3f71d262e3097d939f6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Jul 2022 19:52:52 -0700 Subject: tls: fix spelling of MIB MIN -> MIB Fixes: 88527790c079 ("tls: rx: add sockopt for enabling optimistic decrypt with TLS 1.3") Signed-off-by: Jakub Kicinski --- include/uapi/linux/snmp.h | 2 +- net/tls/tls_proc.c | 2 +- net/tls/tls_sw.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1c9152add663..fd83fb9e525a 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -344,7 +344,7 @@ enum LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ - LINUX_MIN_TLSDECRYPTRETRY, /* TlsDecryptRetry */ + LINUX_MIB_TLSDECRYPTRETRY, /* TlsDecryptRetry */ __LINUX_MIB_TLSMAX }; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 1246e52b48f6..ede9df13c398 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -20,7 +20,7 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), - SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY), + SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 09370f853031..e12846d1871a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1596,7 +1596,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && darg->tail != TLS_RECORD_TYPE_DATA)) { darg->zc = false; - TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY); + TLS_INC_STATS(sock_net(sk), 
LINUX_MIB_TLSDECRYPTRETRY); return decrypt_skb_update(sk, skb, dest, darg); } -- cgit v1.2.3 From bb56cea9abd85c22175b31d8f7c44d6c615fe526 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Jul 2022 19:52:53 -0700 Subject: tls: rx: add counter for NoPad violations As discussed with Maxim, add a counter for true NoPad violations. This should help deployments catch unexpected padded records vs just control records, which always need re-encryption. https://lore.kernel.org/all/b111828e6ac34baad9f4e783127eba8344ac252d.camel@nvidia.com/ Signed-off-by: Jakub Kicinski --- Documentation/networking/tls.rst | 4 ++++ include/uapi/linux/snmp.h | 1 + net/tls/tls_proc.c | 1 + net/tls/tls_sw.c | 2 ++ 4 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 7a6643836e42..658ed3a71e1b 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -282,3 +282,7 @@ TLS implementation exposes the following per-namespace statistics number of RX records which had to be re-decrypted due to ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. Note that this counter will also increment for non-data records. + +- ``TlsRxNoPadViolation`` - + number of data RX records which had to be re-decrypted due to + ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index fd83fb9e525a..4d7470036a8b 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -345,6 +345,7 @@ enum LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ LINUX_MIB_TLSDECRYPTRETRY, /* TlsDecryptRetry */ + LINUX_MIB_TLSRXNOPADVIOL, /* TlsRxNoPadViolation */ __LINUX_MIB_TLSMAX }; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index ede9df13c398..68982728f620 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -21,6 +21,7 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), + SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e12846d1871a..68d79ee48a56 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1596,6 +1596,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && darg->tail != TLS_RECORD_TYPE_DATA)) { darg->zc = false; + if (!darg->tail) + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXNOPADVIOL); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTRETRY); return decrypt_skb_update(sk, skb, dest, darg); } -- cgit v1.2.3 From 50e8ca0b675add8eb0e95938448a4a0f9fb0b6dd Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Mon, 11 Jul 2022 14:10:55 -0400 Subject: drm/amdkfd: bump KFD version for unified ctx save/restore memory To expose unified memory for ctx save/restore area feature availability to libhsakmt.
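A hedged sketch of the feature probe a runtime like libhsakmt might perform (the helper name is made up for illustration; the ioctl and struct come from kfd_ioctl.h):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* hypothetical helper: returns 1 if KFD advertises unified ctx
 * save/restore memory, i.e. interface version 1.11 or later */
static int kfd_has_unified_ctx_sr(void)
{
	struct kfd_ioctl_get_version_args args = {0};
	int fd = open("/dev/kfd", O_RDWR | O_CLOEXEC);
	int ok = 0;

	if (fd < 0)
		return 0;
	if (!ioctl(fd, AMDKFD_IOC_GET_VERSION, &args))
		ok = args.major_version == 1 && args.minor_version >= 11;
	close(fd);
	return ok;
}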
Proposed userspace: https://patchwork.freedesktop.org/series/106218/ Signed-off-by: Eric Huang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index d6bda40fbcf7..42b60198b6c5 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -36,9 +36,10 @@ * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs * - 1.9 - Add available memory ioctl * - 1.10 - Add SMI profiler event log + * - 1.11 - Add unified memory for ctx save/restore area */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 10 +#define KFD_IOCTL_MINOR_VERSION 11 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From 1b870fa5573e260bc74d19f381ab0dd971a8d8e7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Jul 2022 07:27:31 -0400 Subject: kvm: stats: tell userspace which values are boolean Some of the statistics values exported by KVM are always only 0 or 1. It can be useful to export this fact to userspace so that it can track them specially (for example by polling the value every now and then to compute a % of time spent in a specific state). Therefore, add "boolean value" as a new "unit". While it is not exactly a unit, it walks and quacks like one. In particular, using the type would be wrong because boolean values could be instantaneous or peak values (e.g. "is the rmap allocated?") or even two-bucket histograms (e.g. "number of posted vs. non-posted interrupt injections"). Suggested-by: Amneesh Singh Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 11 ++++++++++- include/uapi/linux/kvm.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 11e00a46c610..48bf6e49a7de 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5657,6 +5657,7 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) + #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 @@ -5724,6 +5725,11 @@ Bits 4-7 of ``flags`` encode the unit: It indicates that the statistics data is used to measure time or latency. * ``KVM_STATS_UNIT_CYCLES`` It indicates that the statistics data is used to measure CPU clock cycles. + * ``KVM_STATS_UNIT_BOOLEAN`` + It indicates that the statistic will always be either 0 or 1. Boolean + statistics of "peak" type will never go back from 1 to 0. Boolean + statistics can be linear histograms (with two buckets) but not logarithmic + histograms. 
Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the unit: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 26d0cac32f73..0c3e85e8fce9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -298,7 +298,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode) }; const struct kvm_stats_header kvm_vcpu_stats_header = { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 83cf7fd842e0..90a45ef7203b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1822,6 +1822,15 @@ struct _kvm_stats_desc { STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) +/* Instantaneous boolean value, read only */ +#define STATS_DESC_IBOOLEAN(SCOPE, name) \ + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ + KVM_STATS_BASE_POW10, 0) +/* Peak (sticky) boolean value, read/write */ +#define STATS_DESC_PBOOLEAN(SCOPE, name) \ + STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ + KVM_STATS_BASE_POW10, 0) + /* Cumulative time in nanosecond */ #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ @@ -1853,7 +1862,7 @@ struct _kvm_stats_desc { HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ HALT_POLL_HIST_COUNT), \ - STATS_DESC_ICOUNTER(VCPU_GENERIC, blocking) + STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) extern struct dentry *kvm_debugfs_dir; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..811897dadcae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2083,6 +2083,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 -- cgit v1.2.3 From 71f28f3136aff5890cd56de78abc673f8393cad9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 Jul 2022 22:07:10 +0800 Subject: ublk_drv: add io_uring based userspace block driver This is the driver part of userspace block driver(ublk driver), the other part is userspace daemon part(ublksrv)[1]. The two parts communicate by io_uring's IORING_OP_URING_CMD with one shared cmd buffer for storing io command, and the buffer is read only for ublksrv, each io command is indexed by io request tag directly, and is written by ublk driver. For example, when one READ io request is submitted to ublk block driver, ublk driver stores the io command into cmd buffer first, then completes one IORING_OP_URING_CMD for notifying ublksrv, and the URING_CMD is issued to ublk driver beforehand by ublksrv for getting notification of any new io request, and each URING_CMD is associated with one io request by tag. After ublksrv gets the io command, it translates and handles the ublk io request, such as, for the ublk-loop target, ublksrv translates the request into same request on another file or disk, like the kernel loop block driver. In ublksrv's implementation, the io is still handled by io_uring, and share same ring with IORING_OP_URING_CMD command. 
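To make the tag-indexed command buffer concrete, here is a hedged sketch of the server side of this handshake, using only the uapi names this patch introduces (the ring must be created with IORING_SETUP_SQE128, since the io command is carried in the sqe's big-sqe area; sizing, rounding and error handling are simplified):

#include <stdint.h>
#include <liburing.h>
#include <sys/mman.h>
#include <linux/ublk_cmd.h>

/* Map queue q_id's command buffer: one ublksrv_io_desc per tag, written
 * by the driver and mapped read-only by the server (see ublk_ch_mmap()).
 * The mmap length must match the driver's page-rounded buffer size. */
static struct ublksrv_io_desc *map_cmd_buf(int ch_fd, int q_id, int q_depth)
{
	size_t sz = (q_depth * sizeof(struct ublksrv_io_desc) + 4095) & ~4095UL;
	off_t off = UBLKSRV_CMD_BUF_OFFSET +
		(off_t)q_id * UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);

	return mmap(NULL, sz, PROT_READ, MAP_SHARED, ch_fd, off);
}

/* Arm one FETCH_REQ per tag; the driver completes the cqe with
 * UBLK_IO_RES_OK when request <tag> arrives, and descs[tag] then holds
 * the op, sector and length for the server to handle. */
static void arm_fetch_req(struct io_uring *ring, int ch_fd, __u16 q_id,
			  __u16 tag, void *io_buf)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;

	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, ch_fd, NULL, 0, 0);
	sqe->cmd_op = UBLK_IO_FETCH_REQ;
	cmd->q_id = q_id;
	cmd->tag = tag;
	cmd->addr = (__u64)(uintptr_t)io_buf;	/* per-tag data buffer */
	sqe->user_data = tag;
}

The commit step differs only in setting cmd_op to UBLK_IO_COMMIT_AND_FETCH_REQ and also filling cmd->result, which is exactly the round trip the next paragraph describes.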
When the target io request is done, the same IORING_OP_URING_CMD is issued to ublk driver for both committing io request result and getting future notification of new io request. Another thing done by ublk driver is to copy data between kernel io request and ublksrv's io buffer: 1) before ubsrv handles WRITE request, copy the request's data into ublksrv's userspace io buffer, so that ublksrv can handle the write request 2) after ubsrv handles READ request, copy ublksrv's userspace io buffer into this READ request, then ublk driver can complete the READ request Zero copy may be switched if mm is ready to support it. ublk driver doesn't handle any logic of the specific user space driver, so it is small/simple enough. [1] ublksrv https://github.com/ming1/ubdsrv Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220713140711.97356-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 9 + drivers/block/Makefile | 2 + drivers/block/ublk_drv.c | 1530 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 156 +++++ 4 files changed, 1697 insertions(+) create mode 100644 drivers/block/ublk_drv.c create mode 100644 include/uapi/linux/ublk_cmd.h (limited to 'include/uapi/linux') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index fdb81f2794cd..e19fcab016ba 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -408,6 +408,15 @@ config BLK_DEV_RBD If unsure, say N. +config BLK_DEV_UBLK + tristate "Userspace block driver (Experimental)" + select IO_URING + help + io_uring based userspace block driver. Together with ublk server, ublk + has been working well, but interface with userspace or command data + definition isn't finalized yet, and might change according to future + requirement, so mark is as experimental now. + source "drivers/block/rnbd/Kconfig" endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 934a9c7c3a7c..be631352567e 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -39,4 +39,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ +obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o + swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c new file mode 100644 index 000000000000..922a84c86fc6 --- /dev/null +++ b/drivers/block/ublk_drv.c @@ -0,0 +1,1530 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Userspace block device - block device which IO is handled from userspace + * + * Take full use of io_uring passthrough command for communicating with + * ublk userspace daemon(ublksrvd) for handling basic IO request. + * + * Copyright 2022 Ming Lei + * + * (part of code stolen from loop.c) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define UBLK_MINORS (1U << MINORBITS) + +struct ublk_uring_cmd_pdu { + struct request *req; +}; + +/* + * io command is active: sqe cmd is received, and its cqe isn't done + * + * If the flag is set, the io command is owned by ublk driver, and waited + * for incoming blk-mq request from the ublk block device. + * + * If the flag is cleared, the io command will be completed, and owned by + * ublk server. 
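+ *
+ * In short, the flag tracks which side currently owns the command slot:
+ * the driver sets it when it accepts the sqe, and clears it when posting
+ * the cqe that hands the slot to the server.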
+ */ +#define UBLK_IO_FLAG_ACTIVE 0x01 + +/* + * IO command is completed via cqe, and it is being handled by ublksrv, and + * not committed yet + * + * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for + * cross verification + */ +#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 + +/* + * IO command is aborted, so this flag is set in case of + * !UBLK_IO_FLAG_ACTIVE. + * + * After this flag is observed, any pending or new incoming request + * associated with this io command will be failed immediately + */ +#define UBLK_IO_FLAG_ABORTED 0x04 + +struct ublk_io { + /* userspace buffer address from io cmd */ + __u64 addr; + unsigned int flags; + int res; + + struct io_uring_cmd *cmd; +}; + +struct ublk_queue { + int q_id; + int q_depth; + + struct task_struct *ubq_daemon; + char *io_cmd_buf; + + unsigned long io_addr; /* mapped vm address */ + unsigned int max_io_sz; + bool abort_work_pending; + unsigned short nr_io_ready; /* how many ios setup */ + struct ublk_device *dev; + struct ublk_io ios[0]; +}; + +#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) + +struct ublk_device { + struct gendisk *ub_disk; + struct request_queue *ub_queue; + + char *__queues; + + unsigned short queue_size; + unsigned short bs_shift; + struct ublksrv_ctrl_dev_info dev_info; + + struct blk_mq_tag_set tag_set; + + struct cdev cdev; + struct device cdev_dev; + + atomic_t ch_open_cnt; + int ub_number; + + struct mutex mutex; + + struct mm_struct *mm; + + struct completion completion; + unsigned int nr_queues_ready; + atomic_t nr_aborted_queues; + + /* + * Our ubq->daemon may be killed without any notification, so + * monitor each queue's daemon periodically + */ + struct delayed_work monitor_work; + struct work_struct stop_work; +}; + +static dev_t ublk_chr_devt; +static struct class *ublk_chr_class; + +static DEFINE_IDR(ublk_index_idr); +static DEFINE_SPINLOCK(ublk_idr_lock); +static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ + +static DEFINE_MUTEX(ublk_ctl_mutex); + +static struct miscdevice ublk_misc; + +static struct ublk_device *ublk_get_device(struct ublk_device *ub) +{ + if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) + return ub; + return NULL; +} + +static void ublk_put_device(struct ublk_device *ub) +{ + put_device(&ub->cdev_dev); +} + +static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, + int qid) +{ + return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); +} + +static inline bool ublk_rq_has_data(const struct request *rq) +{ + return rq->bio && bio_has_data(rq->bio); +} + +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag) +{ + return (struct ublksrv_io_desc *) + &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); +} + +static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +{ + return ublk_get_queue(ub, q_id)->io_cmd_buf; +} + +static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + + return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), + PAGE_SIZE); +} + +static int ublk_open(struct block_device *bdev, fmode_t mode) +{ + return 0; +} + +static void ublk_release(struct gendisk *disk, fmode_t mode) +{ +} + +static const struct block_device_operations ub_fops = { + .owner = THIS_MODULE, + .open = ublk_open, + .release = ublk_release, +}; + +#define UBLK_MAX_PIN_PAGES 32 + +struct ublk_map_data { + const struct ublk_queue *ubq; + const struct request *rq; + const struct ublk_io *io; + unsigned max_bytes; 
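+	/* in: number of bytes to transfer; decremented as pages are
+	 * copied, so after ublk_copy_user_pages() it holds the bytes
+	 * that were not transferred
+	 */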
+}; + +struct ublk_io_iter { + struct page *pages[UBLK_MAX_PIN_PAGES]; + unsigned pg_off; /* offset in the 1st page in pages */ + int nr_pages; /* how many page pointers in pages */ + struct bio *bio; + struct bvec_iter iter; +}; + +static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, + unsigned max_bytes, bool to_vm) +{ + const unsigned total = min_t(unsigned, max_bytes, + PAGE_SIZE - data->pg_off + + ((data->nr_pages - 1) << PAGE_SHIFT)); + unsigned done = 0; + unsigned pg_idx = 0; + + while (done < total) { + struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); + const unsigned int bytes = min3(bv.bv_len, total - done, + (unsigned)(PAGE_SIZE - data->pg_off)); + void *bv_buf = bvec_kmap_local(&bv); + void *pg_buf = kmap_local_page(data->pages[pg_idx]); + + if (to_vm) + memcpy(pg_buf + data->pg_off, bv_buf, bytes); + else + memcpy(bv_buf, pg_buf + data->pg_off, bytes); + + kunmap_local(pg_buf); + kunmap_local(bv_buf); + + /* advance page array */ + data->pg_off += bytes; + if (data->pg_off == PAGE_SIZE) { + pg_idx += 1; + data->pg_off = 0; + } + + done += bytes; + + /* advance bio */ + bio_advance_iter_single(data->bio, &data->iter, bytes); + if (!data->iter.bi_size) { + data->bio = data->bio->bi_next; + if (data->bio == NULL) + break; + data->iter = data->bio->bi_iter; + } + } + + return done; +} + +static inline int ublk_copy_user_pages(struct ublk_map_data *data, + bool to_vm) +{ + const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; + const unsigned long start_vm = data->io->addr; + unsigned int done = 0; + struct ublk_io_iter iter = { + .pg_off = start_vm & (PAGE_SIZE - 1), + .bio = data->rq->bio, + .iter = data->rq->bio->bi_iter, + }; + const unsigned int nr_pages = round_up(data->max_bytes + + (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; + + while (done < nr_pages) { + const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, + nr_pages - done); + unsigned i, len; + + iter.nr_pages = get_user_pages_fast(start_vm + + (done << PAGE_SHIFT), to_pin, gup_flags, + iter.pages); + if (iter.nr_pages <= 0) + return done == 0 ? 
iter.nr_pages : done; + len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); + for (i = 0; i < iter.nr_pages; i++) { + if (to_vm) + set_page_dirty(iter.pages[i]); + put_page(iter.pages[i]); + } + data->max_bytes -= len; + done += iter.nr_pages; + } + + return done; +} + +static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, + struct ublk_io *io) +{ + const unsigned int rq_bytes = blk_rq_bytes(req); + /* + * no zero copy, we delay copy WRITE request data into ublksrv + * context and the big benefit is that pinning pages in current + * context is pretty fast, see ublk_pin_user_pages + */ + if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) + return rq_bytes; + + if (ublk_rq_has_data(req)) { + struct ublk_map_data data = { + .ubq = ubq, + .rq = req, + .io = io, + .max_bytes = rq_bytes, + }; + + ublk_copy_user_pages(&data, true); + + return rq_bytes - data.max_bytes; + } + return rq_bytes; +} + +static int ublk_unmap_io(const struct ublk_queue *ubq, + const struct request *req, + struct ublk_io *io) +{ + const unsigned int rq_bytes = blk_rq_bytes(req); + + if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { + struct ublk_map_data data = { + .ubq = ubq, + .rq = req, + .io = io, + .max_bytes = io->res, + }; + + WARN_ON_ONCE(io->res > rq_bytes); + + ublk_copy_user_pages(&data, false); + + return io->res - data.max_bytes; + } + return rq_bytes; +} + +static inline unsigned int ublk_req_build_flags(struct request *req) +{ + unsigned flags = 0; + + if (req->cmd_flags & REQ_FAILFAST_DEV) + flags |= UBLK_IO_F_FAILFAST_DEV; + + if (req->cmd_flags & REQ_FAILFAST_TRANSPORT) + flags |= UBLK_IO_F_FAILFAST_TRANSPORT; + + if (req->cmd_flags & REQ_FAILFAST_DRIVER) + flags |= UBLK_IO_F_FAILFAST_DRIVER; + + if (req->cmd_flags & REQ_META) + flags |= UBLK_IO_F_META; + + if (req->cmd_flags & REQ_INTEGRITY) + flags |= UBLK_IO_F_INTEGRITY; + + if (req->cmd_flags & REQ_FUA) + flags |= UBLK_IO_F_FUA; + + if (req->cmd_flags & REQ_PREFLUSH) + flags |= UBLK_IO_F_PREFLUSH; + + if (req->cmd_flags & REQ_NOUNMAP) + flags |= UBLK_IO_F_NOUNMAP; + + if (req->cmd_flags & REQ_SWAP) + flags |= UBLK_IO_F_SWAP; + + return flags; +} + +static int ublk_setup_iod(struct ublk_queue *ubq, struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_READ: + ublk_op = UBLK_IO_OP_READ; + break; + case REQ_OP_WRITE: + ublk_op = UBLK_IO_OP_WRITE; + break; + case REQ_OP_FLUSH: + ublk_op = UBLK_IO_OP_FLUSH; + break; + case REQ_OP_DISCARD: + ublk_op = UBLK_IO_OP_DISCARD; + break; + case REQ_OP_WRITE_ZEROES: + ublk_op = UBLK_IO_OP_WRITE_ZEROES; + break; + default: + return BLK_STS_IOERR; + } + + /* need to translate since kernel may change */ + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; +} + +static bool ubq_daemon_is_dying(struct ublk_queue *ubq) +{ + return ubq->ubq_daemon->flags & PF_EXITING; +} + +/* todo: handle partial completion */ +static void ublk_complete_rq(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[req->tag]; + unsigned int unmapped_bytes; + + /* failed read IO if nothing is read */ + if (!io->res && 
req_op(req) == REQ_OP_READ) + io->res = -EIO; + + if (io->res < 0) { + blk_mq_end_request(req, errno_to_blk_status(io->res)); + return; + } + + /* + * FLUSH or DISCARD usually won't return bytes returned, so end them + * directly. + * + * Both the two needn't unmap. + */ + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { + blk_mq_end_request(req, BLK_STS_OK); + return; + } + + /* for READ request, writing data in iod->addr to rq buffers */ + unmapped_bytes = ublk_unmap_io(ubq, req, io); + + /* + * Extremely impossible since we got data filled in just before + * + * Re-read simply for this unlikely case. + */ + if (unlikely(unmapped_bytes < io->res)) + io->res = unmapped_bytes; + + if (blk_update_request(req, BLK_STS_OK, io->res)) + blk_mq_requeue_request(req, true); + else + __blk_mq_end_request(req, BLK_STS_OK); +} + +/* + * __ublk_fail_req() may be called from abort context or ->ubq_daemon + * context during exiting, so lock is required. + * + * Also aborting may not be started yet, keep in mind that one failed + * request may be issued by block layer again. + */ +static void __ublk_fail_req(struct ublk_io *io, struct request *req) +{ + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + + if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { + io->flags |= UBLK_IO_FLAG_ABORTED; + blk_mq_end_request(req, BLK_STS_IOERR); + } +} + +#define UBLK_REQUEUE_DELAY_MS 3 + +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_device *ub = cmd->file->private_data; + struct request *req = pdu->req; + struct ublk_queue *ubq = req->mq_hctx->driver_data; + int tag = req->tag; + struct ublk_io *io = &ubq->ios[tag]; + bool task_exiting = current != ubq->ubq_daemon || + (current->flags & PF_EXITING); + unsigned int mapped_bytes; + + pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", + __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + if (unlikely(task_exiting)) { + blk_mq_end_request(req, BLK_STS_IOERR); + mod_delayed_work(system_wq, &ub->monitor_work, 0); + return; + } + + mapped_bytes = ublk_map_io(ubq, req, io); + + /* partially mapped, update io descriptor */ + if (unlikely(mapped_bytes != blk_rq_bytes(req))) { + /* + * Nothing mapped, retry until we succeed. + * + * We may never succeed in mapping any bytes here because + * of OOM. TODO: reserve one buffer with single page pinned + * for providing forward progress guarantee. + */ + if (unlikely(!mapped_bytes)) { + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, + UBLK_REQUEUE_DELAY_MS); + return; + } + + ublk_get_iod(ubq, req->tag)->nr_sectors = + mapped_bytes >> 9; + } + + /* mark this cmd owned by ublksrv */ + io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; + + /* + * clear ACTIVE since we are done with this sqe/cmd slot + * We can only accept io cmd in case of being not active. 
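+ * (i.e. once ACTIVE is cleared and the cqe is posted, the server may
+ * submit UBLK_IO_COMMIT_AND_FETCH_REQ against this tag again)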
+ */ + io->flags &= ~UBLK_IO_FLAG_ACTIVE; + + /* tell ublksrv one io request is coming */ + io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0); +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + blk_status_t res; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) + return BLK_STS_IOERR; + + blk_mq_start_request(bd->rq); + + if (unlikely(ubq_daemon_is_dying(ubq))) { + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); + return BLK_STS_IOERR; + } + + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + + return BLK_STS_OK; +} + + +static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, + unsigned int hctx_idx) +{ + struct ublk_device *ub = hctx->queue->queuedata; + struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num); + + hctx->driver_data = ubq; + return 0; +} + +static const struct blk_mq_ops ublk_mq_ops = { + .queue_rq = ublk_queue_rq, + .init_hctx = ublk_init_hctx, +}; + +static int ublk_ch_open(struct inode *inode, struct file *filp) +{ + struct ublk_device *ub = container_of(inode->i_cdev, + struct ublk_device, cdev); + + if (atomic_cmpxchg(&ub->ch_open_cnt, 0, 1) == 0) { + filp->private_data = ub; + return 0; + } + return -EBUSY; +} + +static int ublk_ch_release(struct inode *inode, struct file *filp) +{ + struct ublk_device *ub = filp->private_data; + + while (atomic_cmpxchg(&ub->ch_open_cnt, 1, 0) != 1) + cpu_relax(); + + filp->private_data = NULL; + return 0; +} + +/* map pre-allocated per-queue cmd buffer to ublksrv daemon */ +static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ublk_device *ub = filp->private_data; + size_t sz = vma->vm_end - vma->vm_start; + unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; + int q_id, ret = 0; + + mutex_lock(&ub->mutex); + if (!ub->mm) + ub->mm = current->mm; + if (current->mm != ub->mm) + ret = -EINVAL; + mutex_unlock(&ub->mutex); + + if (ret) + return ret; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz; + if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end) + return -EINVAL; + + q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz; + pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n", + __func__, q_id, current->pid, vma->vm_start, + phys_off, (unsigned long)sz); + + if (sz != ublk_queue_cmd_buf_size(ub, q_id)) + return -EINVAL; + + pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +static void ublk_commit_completion(struct ublk_device *ub, + struct ublksrv_io_cmd *ub_cmd) +{ + u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; + struct ublk_queue *ubq = ublk_get_queue(ub, qid); + struct ublk_io *io = &ubq->ios[tag]; + struct request *req; + + /* now this cmd slot is owned by nbd driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + io->res = ub_cmd->result; + + /* find the io request and complete */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + + if (req && likely(!blk_should_fake_timeout(req->q))) + ublk_complete_rq(req); +} + +/* + * When ->ubq_daemon is exiting, either new request is 
ended immediately, + * or any queued io command is drained, so it is safe to abort queue + * lockless + */ +static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + if (!ublk_get_device(ub)) + return; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { + struct request *rq; + + /* + * Either we fail the request or ublk_rq_task_work_fn + * will do it + */ + rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); + if (rq) + __ublk_fail_req(io, rq); + } + } + ublk_put_device(ub); +} + +static void ublk_daemon_monitor_work(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, monitor_work.work); + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + if (ubq_daemon_is_dying(ubq)) { + schedule_work(&ub->stop_work); + + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + } + } + + /* + * We can't schedule monitor work after ublk_remove() is started. + * + * No need ub->mutex, monitor work are canceled after state is marked + * as DEAD, so DEAD state is observed reliably. + */ + if (ub->dev_info.state != UBLK_S_DEV_DEAD) + schedule_delayed_work(&ub->monitor_work, + UBLK_DAEMON_MONITOR_PERIOD); +} + +static void ublk_cancel_queue(struct ublk_queue *ubq) +{ + int i; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (io->flags & UBLK_IO_FLAG_ACTIVE) + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); + } +} + +/* Cancel all pending commands, must be called after del_gendisk() returns */ +static void ublk_cancel_dev(struct ublk_device *ub) +{ + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_cancel_queue(ublk_get_queue(ub, i)); +} + +static void ublk_stop_dev(struct ublk_device *ub) +{ + mutex_lock(&ub->mutex); + if (!disk_live(ub->ub_disk)) + goto unlock; + + del_gendisk(ub->ub_disk); + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ublk_cancel_dev(ub); + unlock: + mutex_unlock(&ub->mutex); + cancel_delayed_work_sync(&ub->monitor_work); +} + +static int ublk_ctrl_stop_dev(struct ublk_device *ub) +{ + ublk_stop_dev(ub); + cancel_work_sync(&ub->stop_work); + return 0; +} + +static inline bool ublk_queue_ready(struct ublk_queue *ubq) +{ + return ubq->nr_io_ready == ubq->q_depth; +} + +/* device can only be started after all IOs are ready */ +static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) +{ + mutex_lock(&ub->mutex); + ubq->nr_io_ready++; + if (ublk_queue_ready(ubq)) { + ubq->ubq_daemon = current; + get_task_struct(ubq->ubq_daemon); + ub->nr_queues_ready++; + } + if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) + complete_all(&ub->completion); + mutex_unlock(&ub->mutex); +} + +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; + struct ublk_device *ub = cmd->file->private_data; + struct ublk_queue *ubq; + struct ublk_io *io; + u32 cmd_op = cmd->cmd_op; + unsigned tag = ub_cmd->tag; + int ret = -EINVAL; + + pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", + __func__, cmd->cmd_op, ub_cmd->q_id, tag, + ub_cmd->result); + + if (!(issue_flags & IO_URING_F_SQE128)) + goto out; + + if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + goto out; + + ubq = ublk_get_queue(ub, ub_cmd->q_id); + if (!ubq || ub_cmd->q_id != ubq->q_id) + goto out; + + if 
(ubq->ubq_daemon && ubq->ubq_daemon != current) + goto out; + + if (tag >= ubq->q_depth) + goto out; + + io = &ubq->ios[tag]; + + /* there is pending io cmd, something must be wrong */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + ret = -EBUSY; + goto out; + } + + switch (cmd_op) { + case UBLK_IO_FETCH_REQ: + /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ + if (ublk_queue_ready(ubq)) { + ret = -EBUSY; + goto out; + } + /* + * The io is being handled by server, so COMMIT_RQ is expected + * instead of FETCH_REQ + */ + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + goto out; + /* FETCH_RQ has to provide IO buffer */ + if (!ub_cmd->addr) + goto out; + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->addr = ub_cmd->addr; + + ublk_mark_io_ready(ub, ubq); + break; + case UBLK_IO_COMMIT_AND_FETCH_REQ: + /* FETCH_RQ has to provide IO buffer */ + if (!ub_cmd->addr) + goto out; + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + goto out; + io->addr = ub_cmd->addr; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->cmd = cmd; + ublk_commit_completion(ub, ub_cmd); + break; + default: + goto out; + } + return -EIOCBQUEUED; + + out: + io->flags &= ~UBLK_IO_FLAG_ACTIVE; + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", + __func__, cmd_op, tag, ret, io->flags); + return -EIOCBQUEUED; +} + +static const struct file_operations ublk_ch_fops = { + .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .llseek = no_llseek, + .uring_cmd = ublk_ch_uring_cmd, + .mmap = ublk_ch_mmap, +}; + +static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +{ + int size = ublk_queue_cmd_buf_size(ub, q_id); + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + + if (ubq->ubq_daemon) + put_task_struct(ubq->ubq_daemon); + if (ubq->io_cmd_buf) + free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); +} + +static int ublk_init_queue(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; + void *ptr; + int size; + + ubq->q_id = q_id; + ubq->q_depth = ub->dev_info.queue_depth; + size = ublk_queue_cmd_buf_size(ub, q_id); + + ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); + if (!ptr) + return -ENOMEM; + + ubq->io_cmd_buf = ptr; + ubq->dev = ub; + return 0; +} + +static void ublk_deinit_queues(struct ublk_device *ub) +{ + int nr_queues = ub->dev_info.nr_hw_queues; + int i; + + if (!ub->__queues) + return; + + for (i = 0; i < nr_queues; i++) + ublk_deinit_queue(ub, i); + kfree(ub->__queues); +} + +static int ublk_init_queues(struct ublk_device *ub) +{ + int nr_queues = ub->dev_info.nr_hw_queues; + int depth = ub->dev_info.queue_depth; + int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); + int i, ret = -ENOMEM; + + ub->queue_size = ubq_size; + ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + if (!ub->__queues) + return ret; + + for (i = 0; i < nr_queues; i++) { + if (ublk_init_queue(ub, i)) + goto fail; + } + + init_completion(&ub->completion); + return 0; + + fail: + ublk_deinit_queues(ub); + return ret; +} + +static int __ublk_alloc_dev_number(struct ublk_device *ub, int idx) +{ + int i = idx; + int err; + + spin_lock(&ublk_idr_lock); + /* allocate id, if @id >= 0, we're requesting that specific id */ + if (i >= 0) { + err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT); + if (err == -ENOSPC) + err = -EEXIST; + } else { + err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT); + } + 
spin_unlock(&ublk_idr_lock); + + if (err >= 0) + ub->ub_number = err; + + return err; +} + +static struct ublk_device *__ublk_create_dev(int idx) +{ + struct ublk_device *ub = NULL; + int ret; + + ub = kzalloc(sizeof(*ub), GFP_KERNEL); + if (!ub) + return ERR_PTR(-ENOMEM); + + ret = __ublk_alloc_dev_number(ub, idx); + if (ret < 0) { + kfree(ub); + return ERR_PTR(ret); + } + return ub; +} + +static void __ublk_destroy_dev(struct ublk_device *ub) +{ + spin_lock(&ublk_idr_lock); + idr_remove(&ublk_index_idr, ub->ub_number); + wake_up_all(&ublk_idr_wq); + spin_unlock(&ublk_idr_lock); + + mutex_destroy(&ub->mutex); + + kfree(ub); +} + +static void ublk_cdev_rel(struct device *dev) +{ + struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); + + put_disk(ub->ub_disk); + + blk_mq_free_tag_set(&ub->tag_set); + + ublk_deinit_queues(ub); + + __ublk_destroy_dev(ub); +} + +static int ublk_add_chdev(struct ublk_device *ub) +{ + struct device *dev = &ub->cdev_dev; + int minor = ub->ub_number; + int ret; + + dev->parent = ublk_misc.this_device; + dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); + dev->class = ublk_chr_class; + dev->release = ublk_cdev_rel; + device_initialize(dev); + + ret = dev_set_name(dev, "ublkc%d", minor); + if (ret) + goto fail; + + cdev_init(&ub->cdev, &ublk_ch_fops); + ret = cdev_device_add(&ub->cdev, dev); + if (ret) + goto fail; + return 0; + fail: + put_device(dev); + return ret; +} + +static void ublk_stop_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, stop_work); + + ublk_stop_dev(ub); +} + +static void ublk_update_capacity(struct ublk_device *ub) +{ + unsigned int max_rq_bytes; + + /* make max request buffer size aligned with PAGE_SIZE */ + max_rq_bytes = round_down(ub->dev_info.rq_max_blocks << + ub->bs_shift, PAGE_SIZE); + ub->dev_info.rq_max_blocks = max_rq_bytes >> ub->bs_shift; + + set_capacity(ub->ub_disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9)); +} + +/* add disk & cdev, cleanup everything in case of failure */ +static int ublk_add_dev(struct ublk_device *ub) +{ + struct gendisk *disk; + int err = -ENOMEM; + int bsize; + + /* We are not ready to support zero copy */ + ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY; + + bsize = ub->dev_info.block_size; + ub->bs_shift = ilog2(bsize); + + ub->dev_info.nr_hw_queues = min_t(unsigned int, + ub->dev_info.nr_hw_queues, nr_cpu_ids); + + INIT_WORK(&ub->stop_work, ublk_stop_work_fn); + INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); + + if (ublk_init_queues(ub)) + goto out_destroy_dev; + + ub->tag_set.ops = &ublk_mq_ops; + ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; + ub->tag_set.queue_depth = ub->dev_info.queue_depth; + ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ub->tag_set.driver_data = ub; + + err = blk_mq_alloc_tag_set(&ub->tag_set); + if (err) + goto out_deinit_queues; + + disk = ub->ub_disk = blk_mq_alloc_disk(&ub->tag_set, ub); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); + goto out_cleanup_tags; + } + ub->ub_queue = ub->ub_disk->queue; + + ub->ub_queue->queuedata = ub; + + blk_queue_logical_block_size(ub->ub_queue, bsize); + blk_queue_physical_block_size(ub->ub_queue, bsize); + blk_queue_io_min(ub->ub_queue, bsize); + + blk_queue_max_hw_sectors(ub->ub_queue, ub->dev_info.rq_max_blocks << + (ub->bs_shift - 9)); + + ub->ub_queue->limits.discard_granularity = PAGE_SIZE; + + blk_queue_max_discard_sectors(ub->ub_queue, UINT_MAX >> 9); + 
blk_queue_max_write_zeroes_sectors(ub->ub_queue, UINT_MAX >> 9); + + ublk_update_capacity(ub); + + disk->fops = &ub_fops; + disk->private_data = ub; + disk->queue = ub->ub_queue; + sprintf(disk->disk_name, "ublkb%d", ub->ub_number); + + mutex_init(&ub->mutex); + + /* add char dev so that ublksrv daemon can be setup */ + err = ublk_add_chdev(ub); + if (err) + return err; + + /* don't expose disk now until we got start command from cdev */ + + return 0; + +out_cleanup_tags: + blk_mq_free_tag_set(&ub->tag_set); +out_deinit_queues: + ublk_deinit_queues(ub); +out_destroy_dev: + __ublk_destroy_dev(ub); + return err; +} + +static void ublk_remove(struct ublk_device *ub) +{ + ublk_ctrl_stop_dev(ub); + + cdev_device_del(&ub->cdev, &ub->cdev_dev); + put_device(&ub->cdev_dev); +} + +static struct ublk_device *ublk_get_device_from_id(int idx) +{ + struct ublk_device *ub = NULL; + + if (idx < 0) + return NULL; + + spin_lock(&ublk_idr_lock); + ub = idr_find(&ublk_index_idr, idx); + if (ub) + ub = ublk_get_device(ub); + spin_unlock(&ublk_idr_lock); + + return ub; +} + +static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ret = -EINVAL; + int ublksrv_pid = (int)header->data[0]; + unsigned long dev_blocks = header->data[1]; + + if (ublksrv_pid <= 0) + return ret; + + wait_for_completion_interruptible(&ub->completion); + + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + + mutex_lock(&ub->mutex); + if (!disk_live(ub->ub_disk)) { + /* We may get disk size updated */ + if (dev_blocks) { + ub->dev_info.dev_blocks = dev_blocks; + ublk_update_capacity(ub); + } + ub->dev_info.ublksrv_pid = ublksrv_pid; + ret = add_disk(ub->ub_disk); + if (!ret) + ub->dev_info.state = UBLK_S_DEV_LIVE; + } else { + ret = -EEXIST; + } + mutex_unlock(&ub->mutex); + + return ret; +} + +static struct blk_mq_hw_ctx *ublk_get_hw_queue(struct ublk_device *ub, + unsigned int index) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(ub->ub_queue, hctx, i) + if (hctx->queue_num == index) + return hctx; + return NULL; +} + +static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct blk_mq_hw_ctx *hctx; + struct ublk_device *ub; + unsigned long queue; + unsigned int retlen; + int ret; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + goto out; + + ret = -EINVAL; + queue = header->data[0]; + if (queue >= ub->dev_info.nr_hw_queues) + goto out; + hctx = ublk_get_hw_queue(ub, queue); + if (!hctx) + goto out; + + retlen = min_t(unsigned short, header->len, cpumask_size()); + if (copy_to_user(argp, hctx->cpumask, retlen)) { + ret = -EFAULT; + goto out; + } + if (retlen != header->len) { + if (clear_user(argp + retlen, header->len - retlen)) { + ret = -EFAULT; + goto out; + } + } + ret = 0; + out: + if (ub) + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_dev_info *info, + void __user *argp, int idx) +{ + struct ublk_device *ub; + int ret; + + ret = mutex_lock_killable(&ublk_ctl_mutex); + if (ret) + return ret; + + ub = __ublk_create_dev(idx); + if (!IS_ERR_OR_NULL(ub)) { + memcpy(&ub->dev_info, info, sizeof(*info)); + + /* update device id */ + ub->dev_info.dev_id = ub->ub_number; + + ret = ublk_add_dev(ub); + if (!ret) { + if (copy_to_user(argp, &ub->dev_info, sizeof(*info))) 
{ + ublk_remove(ub); + ret = -EFAULT; + } + } + } else { + if (IS_ERR(ub)) + ret = PTR_ERR(ub); + else + ret = -ENOMEM; + } + mutex_unlock(&ublk_ctl_mutex); + + return ret; +} + +static inline bool ublk_idr_freed(int id) +{ + void *ptr; + + spin_lock(&ublk_idr_lock); + ptr = idr_find(&ublk_index_idr, id); + spin_unlock(&ublk_idr_lock); + + return ptr == NULL; +} + +static int ublk_ctrl_del_dev(int idx) +{ + struct ublk_device *ub; + int ret; + + ret = mutex_lock_killable(&ublk_ctl_mutex); + if (ret) + return ret; + + ub = ublk_get_device_from_id(idx); + if (ub) { + ublk_remove(ub); + ublk_put_device(ub); + ret = 0; + } else { + ret = -ENODEV; + } + + /* + * Wait until the idr is removed, then it can be reused after + * DEL_DEV command is returned. + */ + if (!ret) + wait_event(ublk_idr_wq, ublk_idr_freed(idx)); + mutex_unlock(&ublk_ctl_mutex); + + return ret; +} + + +static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) +{ + pr_devel("%s: dev id %d flags %llx\n", __func__, + info->dev_id, info->flags[0]); + pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", + info->nr_hw_queues, info->queue_depth, + info->block_size, info->dev_blocks); +} + +static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + + pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", + __func__, cmd->cmd_op, header->dev_id, header->queue_id, + header->data[0], header->addr, header->len); +} + +static int ublk_ctrl_cmd_validate(struct io_uring_cmd *cmd, + struct ublksrv_ctrl_dev_info *info) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + u32 cmd_op = cmd->cmd_op; + void __user *argp = (void __user *)(unsigned long)header->addr; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd_op) { + case UBLK_CMD_GET_DEV_INFO: + if (header->len < sizeof(*info) || !header->addr) + return -EINVAL; + break; + case UBLK_CMD_ADD_DEV: + if (header->len < sizeof(*info) || !header->addr) + return -EINVAL; + if (copy_from_user(info, argp, sizeof(*info)) != 0) + return -EFAULT; + ublk_dump_dev_info(info); + if (header->dev_id != info->dev_id) { + printk(KERN_WARNING "%s: cmd %x, dev id not match %u %u\n", + __func__, cmd_op, header->dev_id, + info->dev_id); + return -EINVAL; + } + if (header->queue_id != (u16)-1) { + printk(KERN_WARNING "%s: cmd %x queue_id is wrong %x\n", + __func__, cmd_op, header->queue_id); + return -EINVAL; + } + break; + case UBLK_CMD_GET_QUEUE_AFFINITY: + if ((header->len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (header->len & (sizeof(unsigned long)-1)) + return -EINVAL; + if (!header->addr) + return -EINVAL; + }; + + return 0; +} + +static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublksrv_ctrl_dev_info info; + u32 cmd_op = cmd->cmd_op; + struct ublk_device *ub; + int ret = -EINVAL; + + ublk_ctrl_cmd_dump(cmd); + + if (!(issue_flags & IO_URING_F_SQE128)) + goto out; + + ret = ublk_ctrl_cmd_validate(cmd, &info); + if (ret) + goto out; + + ret = -ENODEV; + switch (cmd_op) { + case UBLK_CMD_START_DEV: + ub = ublk_get_device_from_id(header->dev_id); + if (ub) { + ret = ublk_ctrl_start_dev(ub, cmd); + ublk_put_device(ub); + } + break; + case UBLK_CMD_STOP_DEV: + ub = ublk_get_device_from_id(header->dev_id); + if (ub) { + ret = 
ublk_ctrl_stop_dev(ub); + ublk_put_device(ub); + } + break; + case UBLK_CMD_GET_DEV_INFO: + ub = ublk_get_device_from_id(header->dev_id); + if (ub) { + if (copy_to_user(argp, &ub->dev_info, sizeof(info))) + ret = -EFAULT; + else + ret = 0; + ublk_put_device(ub); + } + break; + case UBLK_CMD_ADD_DEV: + ret = ublk_ctrl_add_dev(&info, argp, header->dev_id); + break; + case UBLK_CMD_DEL_DEV: + ret = ublk_ctrl_del_dev(header->dev_id); + break; + case UBLK_CMD_GET_QUEUE_AFFINITY: + ret = ublk_ctrl_get_queue_affinity(cmd); + break; + default: + break; + }; + out: + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", + __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); + return -EIOCBQUEUED; +} + +static const struct file_operations ublk_ctl_fops = { + .open = nonseekable_open, + .uring_cmd = ublk_ctrl_uring_cmd, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ublk_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ublk-control", + .fops = &ublk_ctl_fops, +}; + +static int __init ublk_init(void) +{ + int ret; + + init_waitqueue_head(&ublk_idr_wq); + + ret = misc_register(&ublk_misc); + if (ret) + return ret; + + ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); + if (ret) + goto unregister_mis; + + ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); + if (IS_ERR(ublk_chr_class)) { + ret = PTR_ERR(ublk_chr_class); + goto free_chrdev_region; + } + return 0; + +free_chrdev_region: + unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); +unregister_mis: + misc_deregister(&ublk_misc); + return ret; +} + +static void __exit ublk_exit(void) +{ + struct ublk_device *ub; + int id; + + class_destroy(ublk_chr_class); + + misc_deregister(&ublk_misc); + + idr_for_each_entry(&ublk_index_idr, ub, id) + ublk_remove(ub); + + idr_destroy(&ublk_index_idr); + unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); +} + +module_init(ublk_init); +module_exit(ublk_exit); + +MODULE_AUTHOR("Ming Lei "); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h new file mode 100644 index 000000000000..4f0c16ec875e --- /dev/null +++ b/include/uapi/linux/ublk_cmd.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef USER_BLK_DRV_CMD_INC_H +#define USER_BLK_DRV_CMD_INC_H + +#include + +/* ublk server command definition */ + +/* + * Admin commands, issued by ublk server, and handled by ublk driver. + */ +#define UBLK_CMD_GET_QUEUE_AFFINITY 0x01 +#define UBLK_CMD_GET_DEV_INFO 0x02 +#define UBLK_CMD_ADD_DEV 0x04 +#define UBLK_CMD_DEL_DEV 0x05 +#define UBLK_CMD_START_DEV 0x06 +#define UBLK_CMD_STOP_DEV 0x07 + +/* + * IO commands, issued by ublk server, and handled by ublk driver. + * + * FETCH_REQ: issued via sqe(URING_CMD) beforehand for fetching IO request + * from ublk driver, should be issued only when starting device. After + * the associated cqe is returned, request's tag can be retrieved via + * cqe->userdata. + * + * COMMIT_AND_FETCH_REQ: issued via sqe(URING_CMD) after ublkserver handled + * this IO request, request's handling result is committed to ublk + * driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be + * handled before completing io request. 
+ */ +#define UBLK_IO_FETCH_REQ 0x20 +#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21 + +/* only ABORT means no re-fetch */ +#define UBLK_IO_RES_OK 0 +#define UBLK_IO_RES_ABORT (-ENODEV) + +#define UBLKSRV_CMD_BUF_OFFSET 0 +#define UBLKSRV_IO_BUF_OFFSET 0x80000000 + +/* the tag is 12 bits, so at most 4096 IOs for each queue */ +#define UBLK_MAX_QUEUE_DEPTH 4096 + +/* + * zero copy requires 4k block size, and can remap ublk driver's io + * request into ublksrv's vm space + */ +#define UBLK_F_SUPPORT_ZERO_COPY (1UL << 0) + +/* device state */ +#define UBLK_S_DEV_DEAD 0 +#define UBLK_S_DEV_LIVE 1 + +/* shipped via sqe->cmd of io_uring command */ +struct ublksrv_ctrl_cmd { + /* sent to which device, must be valid */ + __u32 dev_id; + + /* sent to which queue, must be -1 if the cmd isn't for a queue */ + __u16 queue_id; + /* + * cmd specific buffer, can be IN or OUT. + */ + __u16 len; + __u64 addr; + + /* inline data */ + __u64 data[2]; +}; + +struct ublksrv_ctrl_dev_info { + __u16 nr_hw_queues; + __u16 queue_depth; + __u16 block_size; + __u16 state; + + __u32 rq_max_blocks; + __u32 dev_id; + + __u64 dev_blocks; + + __s32 ublksrv_pid; + __s32 reserved0; + __u64 flags[2]; + + /* For ublksrv internal use, invisible to ublk driver */ + __u64 ublksrv_flags; + __u64 reserved1[9]; +}; + +#define UBLK_IO_OP_READ 0 +#define UBLK_IO_OP_WRITE 1 +#define UBLK_IO_OP_FLUSH 2 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 + +#define UBLK_IO_F_FAILFAST_DEV (1U << 8) +#define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) +#define UBLK_IO_F_FAILFAST_DRIVER (1U << 10) +#define UBLK_IO_F_META (1U << 11) +#define UBLK_IO_F_INTEGRITY (1U << 12) +#define UBLK_IO_F_FUA (1U << 13) +#define UBLK_IO_F_PREFLUSH (1U << 14) +#define UBLK_IO_F_NOUNMAP (1U << 15) +#define UBLK_IO_F_SWAP (1U << 16) + +/* + * io cmd is described by this structure, and stored in shared memory, indexed + * by request tag. + * + * The data is stored by the ublk driver, and read by ublksrv after one fetch + * command returns. + */ +struct ublksrv_io_desc { + /* op: bit 0-7, flags: bit 8-31 */ + __u32 op_flags; + + __u32 nr_sectors; + + /* start sector for this io */ + __u64 start_sector; + + /* buffer address in ublksrv daemon vm space, from ublk driver */ + __u64 addr; +}; + +static inline __u8 ublksrv_get_op(const struct ublksrv_io_desc *iod) +{ + return iod->op_flags & 0xff; +} + +static inline __u32 ublksrv_get_flags(const struct ublksrv_io_desc *iod) +{ + return iod->op_flags >> 8; +} + +/* issued to ublk driver via /dev/ublkcN */ +struct ublksrv_io_cmd { + __u16 q_id; + + /* for fetch/commit which result */ + __u16 tag; + + /* io result, it is valid for COMMIT* command only */ + __s32 result; + + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + */ + __u64 addr; +}; + +#endif -- cgit v1.2.3
From 0edb3696c1713c42f52acbd8355b545e58f782b1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 Jul 2022 22:07:11 +0800 Subject: ublk_drv: support to complete io command via task_work_add Use task_work_add() when it is available, since it can deliver better performance, in particular by batching the signaling of ->ubq_daemon. It is observed that task_work_add() can boost iops by about 4% in a random 4k io test. Apart from how the io command is completed, all other code paths are the same as completing the io command via io_uring_cmd_complete_in_task(). Also add one flag, UBLK_F_URING_CMD_COMP_IN_TASK, so that the two modes can be compared easily.
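As a rough sketch of how the two modes would be selected from userspace (illustrative only, not part of this patch; the structure and flag names are the UAPI ones defined above, everything else is hypothetical):

	/* Hypothetical ublk server fragment: force the
	 * io_uring_cmd_complete_in_task() mode by setting the new flag in
	 * the device info passed to UBLK_CMD_ADD_DEV. */
	struct ublksrv_ctrl_dev_info info = {
		.nr_hw_queues	= 1,
		.queue_depth	= 64,
		.block_size	= 512,
		.dev_id		= -1,	/* let the driver pick an id */
	};

	info.flags[0] |= UBLK_F_URING_CMD_COMP_IN_TASK;
	/* info is then referenced via ublksrv_ctrl_cmd.addr in an SQE128
	 * io_uring command with cmd_op UBLK_CMD_ADD_DEV issued against the
	 * /dev/ublk-control device. */

With the flag clear (and the driver built in), the queue_rq path below dispatches through task_work_add() instead.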
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220713140711.97356-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 75 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/ublk_cmd.h | 6 ++++ 2 files changed, 73 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 922a84c86fc6..35fa06ee70ff 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -41,10 +41,15 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) +struct ublk_rq_data { + struct callback_head work; +}; + struct ublk_uring_cmd_pdu { struct request *req; }; @@ -91,6 +96,7 @@ struct ublk_queue { int q_id; int q_depth; + unsigned long flags; struct task_struct *ubq_daemon; char *io_cmd_buf; @@ -149,6 +155,14 @@ static DEFINE_MUTEX(ublk_ctl_mutex); static struct miscdevice ublk_misc; +static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) +{ + if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && + !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) + return true; + return false; +} + static struct ublk_device *ublk_get_device(struct ublk_device *ub) { if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) @@ -500,12 +514,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req) #define UBLK_REQUEUE_DELAY_MS 3 -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +static inline void __ublk_rq_task_work(struct request *req) { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - struct ublk_device *ub = cmd->file->private_data; - struct request *req = pdu->req; struct ublk_queue *ubq = req->mq_hctx->driver_data; + struct ublk_device *ub = ubq->dev; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; bool task_exiting = current != ubq->ubq_daemon || @@ -557,13 +569,27 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0); } +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + __ublk_rq_task_work(pdu->req); +} + +static void ublk_rq_task_work_fn(struct callback_head *work) +{ + struct ublk_rq_data *data = container_of(work, + struct ublk_rq_data, work); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_rq_task_work(req); +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); blk_status_t res; /* fill iod to slot in io cmd buffer */ @@ -574,16 +600,36 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); if (unlikely(ubq_daemon_is_dying(ubq))) { + fail: mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); return BLK_STS_IOERR; } - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + if (ublk_can_use_task_work(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + enum task_work_notify_mode notify_mode = bd->last ? 
+ TWA_SIGNAL_NO_IPI : TWA_NONE; + + if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) + goto fail; + } else { + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } return BLK_STS_OK; } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + + if (ublk_can_use_task_work(ubq)) + __set_notify_signal(ubq->ubq_daemon); +} static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) @@ -595,9 +641,20 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } +static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + init_task_work(&data->work, ublk_rq_task_work_fn); + return 0; +} + static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, + .commit_rqs = ublk_commit_rqs, .init_hctx = ublk_init_hctx, + .init_request = ublk_init_rq, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -912,6 +969,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; + ubq->flags = ub->dev_info.flags[0]; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; size = ublk_queue_cmd_buf_size(ub, q_id); @@ -1099,6 +1157,7 @@ static int ublk_add_dev(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4f0c16ec875e..a3f5e7c21807 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -48,6 +48,12 @@ */ #define UBLK_F_SUPPORT_ZERO_COPY (1UL << 0) +/* + * Force to complete io cmd via io_uring_cmd_complete_in_task so that + * performance comparison is done easily with using task_work_add + */ +#define UBLK_F_URING_CMD_COMP_IN_TASK (1UL << 1) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From 577e5b8c3924539c7a09e3e00477534f39e61829 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Wed, 8 Jun 2022 12:01:12 +0300 Subject: wifi: cfg80211: add API to add/modify/remove a link station Add an API for adding/modifying/removing a link of a station. Signed-off-by: Shaul Triebitz Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 64 +++++++++++++++++ include/uapi/linux/nl80211.h | 8 +++ net/wireless/nl80211.c | 168 ++++++++++++++++++++++++++++++++++++++++--- net/wireless/rdev-ops.h | 48 +++++++++++++ net/wireless/trace.h | 97 +++++++++++++++++++++++++ 5 files changed, 377 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3bcdd20ace66..422764881269 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1456,6 +1456,61 @@ struct sta_txpwr { enum nl80211_tx_power_setting type; }; +/** + * struct link_station_parameters - link station parameters + * + * Used to change and create a new link station. 
+ * + * @mld_mac: MAC address of the station + * @link_id: the link id (-1 for non-MLD station) + * @link_mac: MAC address of the link + * @supported_rates: supported rates in IEEE 802.11 format + * (or NULL for no change) + * @supported_rates_len: number of supported rates + * @ht_capa: HT capabilities of station + * @vht_capa: VHT capabilities of station + * @opmode_notif: operating mode field from Operating Mode Notification + * @opmode_notif_used: information if operating mode field is used + * @he_capa: HE capabilities of station + * @he_capa_len: the length of the HE capabilities + * @txpwr: transmit power for an associated station + * @txpwr_set: txpwr field is set + * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eht_capa: EHT capabilities of station + * @eht_capa_len: the length of the EHT capabilities + */ +struct link_station_parameters { + const u8 *mld_mac; + int link_id; + const u8 *link_mac; + const u8 *supported_rates; + u8 supported_rates_len; + const struct ieee80211_ht_cap *ht_capa; + const struct ieee80211_vht_cap *vht_capa; + u8 opmode_notif; + bool opmode_notif_used; + const struct ieee80211_he_cap_elem *he_capa; + u8 he_capa_len; + struct sta_txpwr txpwr; + bool txpwr_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_eht_cap_elem *eht_capa; + u8 eht_capa_len; +}; + +/** + * struct link_station_del_parameters - link station deletion parameters + * + * Used to delete a link station entry (or all stations). + * + * @mld_mac: MAC address of the station + * @link_id: the link id + */ +struct link_station_del_parameters { + const u8 *mld_mac; + u32 link_id; +}; + /** * struct station_parameters - station parameters * @@ -4215,6 +4270,9 @@ struct mgmt_frame_regs { * radar channel. * The caller is expected to set chandef pointer to NULL in order to * disable background CAC/radar detection. + * @add_link_station: Add a link to a station. + * @mod_link_station: Modify a link of a station. + * @del_link_station: Remove a link of a station. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4560,6 +4618,12 @@ struct cfg80211_ops { struct cfg80211_fils_aad *fils_aad); int (*set_radar_background)(struct wiphy *wiphy, struct cfg80211_chan_def *chandef); + int (*add_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params); + int (*mod_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_parameters *params); + int (*del_link_station)(struct wiphy *wiphy, struct net_device *dev, + struct link_station_del_parameters *params); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 7bb1ae59f3a5..37bfc934325a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1254,6 +1254,10 @@ * without %NL80211_ATTR_MLO_LINK_ID as an easy way to remove all links * in preparation for e.g. roaming to a regular (non-MLO) AP. 
* + * @NL80211_CMD_ADD_LINK_STA: Add a link to an MLD station + * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station + * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1501,6 +1505,10 @@ enum nl80211_commands { NL80211_CMD_ADD_LINK, NL80211_CMD_REMOVE_LINK, + NL80211_CMD_ADD_LINK_STA, + NL80211_CMD_MODIFY_LINK_STA, + NL80211_CMD_REMOVE_LINK_STA, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 22c4cf6fbb57..3cf8e01e3f7e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6801,7 +6801,8 @@ static int nl80211_set_station_tdls(struct genl_info *info, } static int nl80211_parse_sta_txpower_setting(struct genl_info *info, - struct station_parameters *params) + struct sta_txpwr *txpwr, + bool *txpwr_set) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; int idx; @@ -6813,18 +6814,20 @@ static int nl80211_parse_sta_txpower_setting(struct genl_info *info, return -EOPNOTSUPP; idx = NL80211_ATTR_STA_TX_POWER_SETTING; - params->txpwr.type = nla_get_u8(info->attrs[idx]); + txpwr->type = nla_get_u8(info->attrs[idx]); - if (params->txpwr.type == NL80211_TX_POWER_LIMITED) { + if (txpwr->type == NL80211_TX_POWER_LIMITED) { idx = NL80211_ATTR_STA_TX_POWER; if (info->attrs[idx]) - params->txpwr.power = - nla_get_s16(info->attrs[idx]); + txpwr->power = nla_get_s16(info->attrs[idx]); else return -EINVAL; } - params->sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER; + + *txpwr_set = true; + } else { + *txpwr_set = false; } return 0; @@ -6837,6 +6840,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) struct station_parameters params; u8 *mac_addr; int err; + bool txpwr_set; memset(¶ms, 0, sizeof(params)); @@ -6930,9 +6934,11 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; - err = nl80211_parse_sta_txpower_setting(info, ¶ms); + err = nl80211_parse_sta_txpower_setting(info, ¶ms.txpwr, &txpwr_set); if (err) return err; + if (txpwr_set) + params.sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER; /* Include parameters for TDLS peer (will check later) */ err = nl80211_set_station_tdls(info, ¶ms); @@ -6975,6 +6981,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) u8 *mac_addr = NULL; u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED); + bool txpwr_set; memset(¶ms, 0, sizeof(params)); @@ -7085,9 +7092,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; - err = nl80211_parse_sta_txpower_setting(info, ¶ms); + err = nl80211_parse_sta_txpower_setting(info, ¶ms.txpwr, &txpwr_set); if (err) return err; + if (txpwr_set) + params.sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER; err = nl80211_parse_sta_channel_info(info, ¶ms); if (err) @@ -15682,6 +15691,128 @@ static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info) return 0; } +static int +nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info, + bool add) +{ + struct link_station_parameters params = {}; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + int err; + + if ((add && !rdev->ops->add_link_station) || + (!add && !rdev->ops->mod_link_station)) + 
return -EOPNOTSUPP; + + if (add && !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (add && !info->attrs[NL80211_ATTR_MLD_ADDR]) + return -EINVAL; + + if (add && !info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MLD_ADDR]) + params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + + if (info->attrs[NL80211_ATTR_MAC]) { + params.link_mac = nla_data(info->attrs[NL80211_ATTR_MAC]); + if (!is_valid_ether_addr(params.link_mac)) + return -EINVAL; + } + + if (!info->attrs[NL80211_ATTR_MLO_LINK_ID]) + return -EINVAL; + + params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]); + + if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) { + params.supported_rates = + nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.supported_rates_len = + nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + } + + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) + params.ht_capa = + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) + params.vht_capa = + nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params.he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params.he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) { + params.eht_capa = + nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + params.eht_capa_len = + nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]); + + if (!ieee80211_eht_capa_size_ok((const u8 *)params.he_capa, + (const u8 *)params.eht_capa, + params.eht_capa_len)) + return -EINVAL; + } + } + + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { + params.opmode_notif_used = true; + params.opmode_notif = + nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); + } + + err = nl80211_parse_sta_txpower_setting(info, ¶ms.txpwr, + ¶ms.txpwr_set); + if (err) + return err; + + if (add) + return rdev_add_link_station(rdev, dev, ¶ms); + return rdev_mod_link_station(rdev, dev, ¶ms); +} + +static int +nl80211_add_link_station(struct sk_buff *skb, struct genl_info *info) +{ + return nl80211_add_mod_link_station(skb, info, true); +} + +static int +nl80211_modify_link_station(struct sk_buff *skb, struct genl_info *info) +{ + return nl80211_add_mod_link_station(skb, info, false); +} + +static int +nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info) +{ + struct link_station_del_parameters params = {}; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + + if (!rdev->ops->del_link_station) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MLD_ADDR] || + !info->attrs[NL80211_ATTR_MLO_LINK_ID]) + return -EINVAL; + + params.mld_mac = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + params.link_id = nla_get_u8(info->attrs[NL80211_ATTR_MLO_LINK_ID]); + + return rdev_del_link_station(rdev, dev, ¶ms); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -16832,6 +16963,27 @@ static const struct genl_small_ops nl80211_small_ops[] = { .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_MLO_VALID_LINK_ID), }, + { + .cmd = NL80211_CMD_ADD_LINK_STA, + .doit = nl80211_add_link_station, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | 
+ NL80211_FLAG_MLO_VALID_LINK_ID), + }, + { + .cmd = NL80211_CMD_MODIFY_LINK_STA, + .doit = nl80211_modify_link_station, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), + }, + { + .cmd = NL80211_CMD_REMOVE_LINK_STA, + .doit = nl80211_remove_link_station, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index a329ba036989..6221a996c19f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1448,4 +1448,52 @@ rdev_del_intf_link(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int +rdev_add_link_station(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct link_station_parameters *params) +{ + int ret; + + if (!rdev->ops->add_link_station) + return -EOPNOTSUPP; + + trace_rdev_add_link_station(&rdev->wiphy, dev, params); + ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_mod_link_station(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct link_station_parameters *params) +{ + int ret; + + if (!rdev->ops->mod_link_station) + return -EOPNOTSUPP; + + trace_rdev_mod_link_station(&rdev->wiphy, dev, params); + ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_del_link_station(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct link_station_del_parameters *params) +{ + int ret; + + if (!rdev->ops->del_link_station) + return -EOPNOTSUPP; + + trace_rdev_del_link_station(&rdev->wiphy, dev, params); + ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 65f8b814ecd0..16d0fe53fcf2 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3775,6 +3775,103 @@ TRACE_EVENT(cfg80211_assoc_comeback, WDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->timeout) ); +DECLARE_EVENT_CLASS(link_station_add_mod, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_parameters *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, mld_mac, 6) + __array(u8, link_mac, 6) + __field(u32, link_id) + __dynamic_array(u8, supported_rates, + params->supported_rates_len) + __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap)) + __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap)) + __field(u8, opmode_notif) + __field(bool, opmode_notif_used) + __dynamic_array(u8, he_capa, params->he_capa_len) + __array(u8, he_6ghz_capa, (int)sizeof(struct ieee80211_he_6ghz_capa)) + __dynamic_array(u8, eht_capa, params->eht_capa_len) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memset(__entry->mld_mac, 0, 6); + memset(__entry->link_mac, 0, 6); + if (params->mld_mac) + memcpy(__entry->mld_mac, params->mld_mac, 6); + if (params->link_mac) + memcpy(__entry->link_mac, params->link_mac, 6); + __entry->link_id = params->link_id; + if (params->supported_rates && params->supported_rates_len) + memcpy(__get_dynamic_array(supported_rates), + params->supported_rates, + 
params->supported_rates_len); + memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap)); + if (params->ht_capa) + memcpy(__entry->ht_capa, params->ht_capa, + sizeof(struct ieee80211_ht_cap)); + memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); + if (params->vht_capa) + memcpy(__entry->vht_capa, params->vht_capa, + sizeof(struct ieee80211_vht_cap)); + __entry->opmode_notif = params->opmode_notif; + __entry->opmode_notif_used = params->opmode_notif_used; + if (params->he_capa && params->he_capa_len) + memcpy(__get_dynamic_array(he_capa), params->he_capa, + params->he_capa_len); + memset(__entry->he_6ghz_capa, 0, sizeof(struct ieee80211_he_6ghz_capa)); + if (params->he_6ghz_capa) + memcpy(__entry->he_6ghz_capa, params->he_6ghz_capa, + sizeof(struct ieee80211_he_6ghz_capa)); + if (params->eht_capa && params->eht_capa_len) + memcpy(__get_dynamic_array(eht_capa), params->eht_capa, + params->eht_capa_len); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + ", link mac: " MAC_PR_FMT ", link id: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac), + MAC_PR_ARG(link_mac), __entry->link_id) +); + +DEFINE_EVENT(link_station_add_mod, rdev_add_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_parameters *params), + TP_ARGS(wiphy, netdev, params) +); + +DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_parameters *params), + TP_ARGS(wiphy, netdev, params) +); + +TRACE_EVENT(rdev_del_link_station, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct link_station_del_parameters *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, mld_mac, 6) + __field(u32, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memset(__entry->mld_mac, 0, 6); + if (params->mld_mac) + memcpy(__entry->mld_mac, params->mld_mac, 6); + __entry->link_id = params->link_id; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + ", link id: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(mld_mac), + __entry->link_id) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 4e9c3af398207d95957ae6c25290891574f2d7e8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Jul 2022 15:02:33 +0200 Subject: wifi: nl80211: add EML/MLD capabilities to per-iftype capabilities We have the per-interface type capabilities, currently for extended capabilities, add the EML/MLD capabilities there to have this advertised by the driver. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 12 ++++++++++-- net/wireless/nl80211.c | 9 +++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f9ea49e67164..bc960646973b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4993,12 +4993,16 @@ struct wiphy_vendor_command { * 802.11-2012 8.4.2.29 for the defined fields. 
* @extended_capabilities_mask: mask of the valid values + * @extended_capabilities_len: length of the extended capabilities + * @eml_capabilities: EML capabilities (for MLO) + * @mld_capa_and_ops: MLD capabilities and operations (for MLO) */ struct wiphy_iftype_ext_capab { enum nl80211_iftype iftype; const u8 *extended_capabilities; const u8 *extended_capabilities_mask; u8 extended_capabilities_len; + u16 eml_capabilities; + u16 mld_capa_and_ops; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 37bfc934325a..3fa586e38f88 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2368,8 +2368,10 @@ enum nl80211_commands { * * @NL80211_ATTR_IFTYPE_EXT_CAPA: Nested attribute of the following attributes: * %NL80211_ATTR_IFTYPE, %NL80211_ATTR_EXT_CAPA, - * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per - * interface type. + * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities and + * other interface-type specific capabilities per interface type. For MLO, + * %NL80211_ATTR_EML_CAPABILITY and %NL80211_ATTR_MLD_CAPA_AND_OPS are + * present. * * @NL80211_ATTR_MU_MIMO_GROUP_DATA: array of 24 bytes that defines a MU-MIMO * groupID for monitor mode. @@ -2709,6 +2711,9 @@ enum nl80211_commands { * suites allowed as %NL80211_MAX_NR_AKM_SUITES which is the legacy maximum * number prior to the introduction of this attribute. * + * @NL80211_ATTR_EML_CAPABILITY: EML Capability information (u16) + * @NL80211_ATTR_MLD_CAPA_AND_OPS: MLD Capabilities and Operations (u16) + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3231,6 +3236,9 @@ enum nl80211_attrs { NL80211_ATTR_MAX_NUM_AKM_SUITES, + NL80211_ATTR_EML_CAPABILITY, + NL80211_ATTR_MLD_CAPA_AND_OPS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 37ec8b3897b4..35fb2b0517d9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2867,6 +2867,15 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, capab->extended_capabilities_mask)) goto nla_put_failure; + if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO && + (nla_put_u16(msg, + NL80211_ATTR_EML_CAPABILITY, + capab->eml_capabilities) || + nla_put_u16(msg, + NL80211_ATTR_MLD_CAPA_AND_OPS, + capab->mld_capa_and_ops))) + goto nla_put_failure; + nla_nest_end(msg, nested_ext_capab); if (state->split) break; -- cgit v1.2.3
From 00f6842ef41d90cc335ae4dbb00d71f4f642c712 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 7 Mar 2022 16:32:00 +0000 Subject: media: v4l: Add packed YUV 4:4:4 YUVA and YUVX pixel formats The new YUVA and YUVX formats are permutations of the existing AYUV and XYUV formats. They are used by the NXP i.MX8 ISI hardware.
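The byte order from the documentation table added below can be captured in a tiny helper; this sketch is illustrative and not part of the patch:

	#include <stdint.h>

	/* Pack one pixel in the V4L2_PIX_FMT_YUVA32 layout: Y' first,
	 * alpha last. V4L2_PIX_FMT_YUVX32 is identical except that the
	 * fourth byte carries no meaning. */
	static inline void pack_yuva32(uint8_t *p, uint8_t y, uint8_t cb,
				       uint8_t cr, uint8_t a)
	{
		p[0] = y;	/* Y'7-0 */
		p[1] = cb;	/* Cb7-0 */
		p[2] = cr;	/* Cr7-0 */
		p[3] = a;	/* A7-0; don't care for YUVX32 */
	}

The existing AYUV32/XYUV32 formats carry the same components with the alpha/padding byte first, which is why the new formats are described as permutations.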
Signed-off-by: Laurent Pinchart Reviewed-by: Nicolas Dufresne Reviewed-by: Jacopo Mondi Reviewed-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/pixfmt-packed-yuv.rst | 20 ++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ioctl.c | 2 ++ include/uapi/linux/videodev2.h | 2 ++ 3 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst index 65520c3af7cf..bf283a1b5581 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst @@ -220,6 +220,26 @@ the second byte and Y'\ :sub:`7-0` in the third byte. - Y'\ :sub:`7-0` - X\ :sub:`7-0` + * .. _V4L2-PIX-FMT-YUVA32: + + - ``V4L2_PIX_FMT_YUVA32`` + - 'YUVA' + + - Y'\ :sub:`7-0` + - Cb\ :sub:`7-0` + - Cr\ :sub:`7-0` + - A\ :sub:`7-0` + + * .. _V4L2-PIX-FMT-YUVX32: + + - ``V4L2_PIX_FMT_YUVX32`` + - 'YUVX' + + - Y'\ :sub:`7-0` + - Cb\ :sub:`7-0` + - Cr\ :sub:`7-0` + - X\ :sub:`7-0` + * .. _V4L2-PIX-FMT-YUV24: - ``V4L2_PIX_FMT_YUV24`` diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index e03362c8aef9..11f19099f33b 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1296,6 +1296,8 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_XYUV32: descr = "32-bit XYUV 8-8-8-8"; break; case V4L2_PIX_FMT_VUYA32: descr = "32-bit VUYA 8-8-8-8"; break; case V4L2_PIX_FMT_VUYX32: descr = "32-bit VUYX 8-8-8-8"; break; + case V4L2_PIX_FMT_YUVA32: descr = "32-bit YUVA 8-8-8-8"; break; + case V4L2_PIX_FMT_YUVX32: descr = "32-bit YUVX 8-8-8-8"; break; case V4L2_PIX_FMT_YUV410: descr = "Planar YUV 4:1:0"; break; case V4L2_PIX_FMT_YUV420: descr = "Planar YUV 4:2:0"; break; case V4L2_PIX_FMT_HI240: descr = "8-bit Dithered RGB (BTTV)"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 5311ac4fde35..0028ab74ca7c 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -593,6 +593,8 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_XYUV32 v4l2_fourcc('X', 'Y', 'U', 'V') /* 32 XYUV-8-8-8-8 */ #define V4L2_PIX_FMT_VUYA32 v4l2_fourcc('V', 'U', 'Y', 'A') /* 32 VUYA-8-8-8-8 */ #define V4L2_PIX_FMT_VUYX32 v4l2_fourcc('V', 'U', 'Y', 'X') /* 32 VUYX-8-8-8-8 */ +#define V4L2_PIX_FMT_YUVA32 v4l2_fourcc('Y', 'U', 'V', 'A') /* 32 YUVA-8-8-8-8 */ +#define V4L2_PIX_FMT_YUVX32 v4l2_fourcc('Y', 'U', 'V', 'X') /* 32 YUVX-8-8-8-8 */ #define V4L2_PIX_FMT_M420 v4l2_fourcc('M', '4', '2', '0') /* 12 YUV 4:2:0 2 lines y, 1 line uv interleaved */ /* two planes -- one Y, one Cr + Cb interleaved */ -- cgit v1.2.3 From 718d2153ad0de0c7c0b6891eaa7f9918d68b6d5e Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Thu, 17 Mar 2022 12:37:12 +0000 Subject: media: v4l2: Make colorspace validity checks more future-proof The helper functions that test validity of colorspace-related fields use the last value of the corresponding enums. This isn't very future-proof, as there's a high chance someone adding a new value may forget to update the helpers. Add new "LAST" entries to the enumerations to improve this, and keep them private to the kernel. 
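The pattern is easy to see in miniature; this sketch (with made-up enum and helper names) shows how a compiler-assigned sentinel keeps a validity check correct as values are appended:

	enum example_val {
		EXAMPLE_DEFAULT	= 0,
		EXAMPLE_FOO	= 1,
		EXAMPLE_BAR	= 2,	/* new entries are added here... */
	#ifdef __KERNEL__
		EXAMPLE_LAST,		/* ...and the sentinel tracks them */
	#endif
	};

	static inline bool example_is_valid(__u32 val)
	{
		return val > EXAMPLE_DEFAULT && val < EXAMPLE_LAST;
	}

Because the compiler assigns EXAMPLE_LAST, appending a value automatically widens the range the helper accepts, with no second site to update.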
Signed-off-by: Laurent Pinchart Acked-by: Sakari Ailus Reviewed-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-common.h | 6 +++--- include/uapi/linux/videodev2.h | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 3eb202259e8c..b708d63995f4 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -563,19 +563,19 @@ static inline void v4l2_buffer_set_timestamp(struct v4l2_buffer *buf, static inline bool v4l2_is_colorspace_valid(__u32 colorspace) { return colorspace > V4L2_COLORSPACE_DEFAULT && - colorspace <= V4L2_COLORSPACE_DCI_P3; + colorspace < V4L2_COLORSPACE_LAST; } static inline bool v4l2_is_xfer_func_valid(__u32 xfer_func) { return xfer_func > V4L2_XFER_FUNC_DEFAULT && - xfer_func <= V4L2_XFER_FUNC_SMPTE2084; + xfer_func < V4L2_XFER_FUNC_LAST; } static inline bool v4l2_is_ycbcr_enc_valid(__u8 ycbcr_enc) { return ycbcr_enc > V4L2_YCBCR_ENC_DEFAULT && - ycbcr_enc <= V4L2_YCBCR_ENC_SMPTE240M; + ycbcr_enc < V4L2_YCBCR_ENC_LAST; } static inline bool v4l2_is_hsv_enc_valid(__u8 hsv_enc) diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 0028ab74ca7c..e32b9e25258d 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -245,6 +245,14 @@ enum v4l2_colorspace { /* DCI-P3 colorspace, used by cinema projectors */ V4L2_COLORSPACE_DCI_P3 = 12, + +#ifdef __KERNEL__ + /* + * Largest supported colorspace value, assigned by the compiler, used + * by the framework to check for invalid values. + */ + V4L2_COLORSPACE_LAST, +#endif }; /* @@ -283,6 +291,13 @@ enum v4l2_xfer_func { V4L2_XFER_FUNC_NONE = 5, V4L2_XFER_FUNC_DCI_P3 = 6, V4L2_XFER_FUNC_SMPTE2084 = 7, +#ifdef __KERNEL__ + /* + * Largest supported transfer function value, assigned by the compiler, + * used by the framework to check for invalid values. + */ + V4L2_XFER_FUNC_LAST, +#endif }; /* @@ -343,6 +358,13 @@ enum v4l2_ycbcr_encoding { /* SMPTE 240M -- Obsolete HDTV */ V4L2_YCBCR_ENC_SMPTE240M = 8, +#ifdef __KERNEL__ + /* + * Largest supported encoding value, assigned by the compiler, used by + * the framework to check for invalid values. + */ + V4L2_YCBCR_ENC_LAST, +#endif }; /* -- cgit v1.2.3
From 64fe675e999c2c7d753ecaaa1349693c59ce6c11 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 8 Jul 2022 17:21:40 +0100 Subject: media: videodev2.h: add V4L2_CTRL_FLAG_DYNAMIC_ARRAY Add a new flag that indicates that this control is a dynamically sized array. Also document this flag. Currently dynamically sized arrays are limited to one-dimensional arrays, but that might change in the future if there is a need for it. The initial use-case of dynamic arrays is stateless codecs. A frame can be divided into many slices, so you want to provide an array containing slice information for each slice. Typically the number of slices is small, but the standard allows for hundreds or thousands of slices. Dynamic arrays are a good solution since sizing the array for the worst case would waste substantial amounts of memory.
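A minimal userspace sketch of how a dynamic array control would be set (illustrative; it assumes the stateless HEVC slice-params control that later patches in this series stabilize, and omits error handling and media-request plumbing):

	/* Set a dynamic array control with three elements; querying the
	 * control afterwards would report elems == 3. */
	struct v4l2_ctrl_hevc_slice_params slices[3] = {};
	struct v4l2_ext_control ctrl = {
		.id	= V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
		.size	= sizeof(slices),
		.ptr	= slices,
	};
	struct v4l2_ext_controls ctrls = {
		.which		= V4L2_CTRL_WHICH_CUR_VAL,
		.count		= 1,
		.controls	= &ctrl,
	};

	ioctl(fd, VIDIOC_S_EXT_CTRLS, &ctrls);

Only the elements actually passed are stored, which is the point of the flag: the array is sized per frame instead of for the worst case.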
Acked-by: Nicolas Dufresne Tested-by: Benjamin Gaignard Tested-by: Jernej Skrabec Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst | 8 ++++++++ Documentation/userspace-api/media/videodev2.h.rst.exceptions | 1 + include/uapi/linux/videodev2.h | 1 + 3 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst index 88f630252d98..a20dfa2a933b 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst @@ -625,6 +625,14 @@ See also the examples in :ref:`control`. ``V4L2_CTRL_FLAG_GRABBED`` flag when buffers are allocated or streaming is in progress since most drivers do not support changing the format in that case. + * - ``V4L2_CTRL_FLAG_DYNAMIC_ARRAY`` + - 0x0800 + - This control is a dynamically sized 1-dimensional array. It + behaves the same as a regular array, except that the number + of elements as reported by the ``elems`` field is between 1 and + ``dims[0]``. So setting the control with a differently sized + array will change the ``elems`` field when the control is + queried afterwards. Return Value ============ diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index 9cbb7a0c354a..0b91200776f8 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -379,6 +379,7 @@ replace define V4L2_CTRL_FLAG_VOLATILE control-flags replace define V4L2_CTRL_FLAG_HAS_PAYLOAD control-flags replace define V4L2_CTRL_FLAG_EXECUTE_ON_WRITE control-flags replace define V4L2_CTRL_FLAG_MODIFY_LAYOUT control-flags +replace define V4L2_CTRL_FLAG_DYNAMIC_ARRAY control-flags replace define V4L2_CTRL_FLAG_NEXT_CTRL control replace define V4L2_CTRL_FLAG_NEXT_COMPOUND control diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index e32b9e25258d..87ebc6baafb6 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1914,6 +1914,7 @@ struct v4l2_querymenu { #define V4L2_CTRL_FLAG_HAS_PAYLOAD 0x0100 #define V4L2_CTRL_FLAG_EXECUTE_ON_WRITE 0x0200 #define V4L2_CTRL_FLAG_MODIFY_LAYOUT 0x0400 +#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 /* Query flags, to be ORed with the control ID */ #define V4L2_CTRL_FLAG_NEXT_CTRL 0x80000000 -- cgit v1.2.3 From 01dcfd53335cfb5c66a6c023ac0f789a5b87ace5 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Fri, 8 Jul 2022 17:21:49 +0100 Subject: media: uapi: Move parsed HEVC pixel format out of staging Move the HEVC pixel format since we are ready to stabilize the uAPI. Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Acked-by: Nicolas Dufresne Tested-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/pixfmt-compressed.rst | 5 ----- include/media/hevc-ctrls.h | 3 --- include/uapi/linux/videodev2.h | 1 + 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst index 967fc803ef94..c352d91a73d8 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst +++
b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst @@ -215,11 +215,6 @@ Compressed Formats See the :ref:`associated Codec Control IDs `. Buffers associated with this pixel format must contain the appropriate number of macroblocks to decode a full corresponding frame. - - .. note:: - - This format is not yet part of the public kernel API and it - is expected to change. * .. _V4L2-PIX-FMT-FWHT: - ``V4L2_PIX_FMT_FWHT`` diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h index e614cf20dfd3..3a6601a46ced 100644 --- a/include/media/hevc-ctrls.h +++ b/include/media/hevc-ctrls.h @@ -13,9 +13,6 @@ #include -/* The pixel format isn't stable at the moment and will likely be renamed. */ -#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ - #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) #define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) #define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 87ebc6baafb6..06e3f81ad855 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -736,6 +736,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_FWHT v4l2_fourcc('F', 'W', 'H', 'T') /* Fast Walsh Hadamard Transform (vicodec) */ #define V4L2_PIX_FMT_FWHT_STATELESS v4l2_fourcc('S', 'F', 'W', 'H') /* Stateless FWHT (vicodec) */ #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ /* Vendor-specific formats */ #define V4L2_PIX_FMT_CPIA1 v4l2_fourcc('C', 'P', 'I', 'A') /* cpia1 YUV */ -- cgit v1.2.3 From 16e2d220cdc64a1518878172dc7b50d4f60e5aac Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Fri, 8 Jul 2022 17:21:51 +0100 Subject: media: uapi: Move the HEVC stateless control type out of staging Move the HEVC stateless control types out of staging and re-number them.
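For illustration (a sketch, not part of the series; fd is an assumed open video node), the re-numbered types land in a contiguous 0x0270-0x0274 block, so userspace can recognize the HEVC compound controls while enumerating with the standard NEXT_CTRL/NEXT_COMPOUND idiom:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Walk all controls and report the HEVC compound ones by type. */
static void list_hevc_compound_ctrls(int fd)
{
	struct v4l2_query_ext_ctrl qc;

	memset(&qc, 0, sizeof(qc));
	qc.id = V4L2_CTRL_FLAG_NEXT_CTRL | V4L2_CTRL_FLAG_NEXT_COMPOUND;
	while (ioctl(fd, VIDIOC_QUERY_EXT_CTRL, &qc) == 0) {
		if (qc.type >= V4L2_CTRL_TYPE_HEVC_SPS &&
		    qc.type <= V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS)
			printf("%s: HEVC compound control, type 0x%04x\n",
			       qc.name, qc.type);
		/* Continue the enumeration from the control just returned. */
		qc.id |= V4L2_CTRL_FLAG_NEXT_CTRL | V4L2_CTRL_FLAG_NEXT_COMPOUND;
	}
}

The range check above is one reason grouping each codec's types in its own block (0x0260 for VP9, 0x0270 for HEVC) is convenient.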
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Acked-by: Nicolas Dufresne Tested-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/videodev2.h.rst.exceptions | 5 +++++ include/media/hevc-ctrls.h | 7 ------- include/uapi/linux/videodev2.h | 6 ++++++ 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index 0b91200776f8..2feea4a5a008 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -153,6 +153,11 @@ replace symbol V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_VP9_FRAME :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_HDR10_CLL_INFO :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_HEVC_SPS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_HEVC_PPS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS :c:type:`v4l2_ctrl_type` # V4L2 capability defines replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h index 3a6601a46ced..42d16e8a1050 100644 --- a/include/media/hevc-ctrls.h +++ b/include/media/hevc-ctrls.h @@ -22,13 +22,6 @@ #define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) #define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_BASE + 1017) -/* enum v4l2_ctrl_type type values */ -#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 - enum v4l2_stateless_hevc_decode_mode { V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 06e3f81ad855..cff2bb78b2cc 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1860,6 +1860,12 @@ enum v4l2_ctrl_type { V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR = 0x0260, V4L2_CTRL_TYPE_VP9_FRAME = 0x0261, + + V4L2_CTRL_TYPE_HEVC_SPS = 0x0270, + V4L2_CTRL_TYPE_HEVC_PPS = 0x0271, + V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS = 0x0272, + V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX = 0x0273, + V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS = 0x0274, }; /* Used in the VIDIOC_QUERYCTRL ioctl for querying controls */ -- cgit v1.2.3 From ca24fef0f2c857b0533f21f9a8a756f9e73d60fb Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Fri, 8 Jul 2022 17:21:56 +0100 Subject: media: uapi: move HEVC stateless controls out of staging HEVC uAPI is used by 2 mainline drivers (Hantro, Cedrus) and at least 2 out-of-tree drivers (rkvdec, RPi). The uAPI has been reviewed so it is time to make it 'public' by un-staging it. 
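To sketch what the stable names enable end to end (a condensed, assumption-laden example: media-request allocation, bitstream parsing and buffer queuing are presumed to happen elsewhere, video_fd/request_fd and the parsed parameter structs are caller-provided, and error handling is omitted), a stateless decoder client attaches the per-frame HEVC controls to a request like so:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

static int queue_hevc_frame_controls(int video_fd, int request_fd,
				     struct v4l2_ctrl_hevc_sps *sps,
				     struct v4l2_ctrl_hevc_pps *pps,
				     struct v4l2_ctrl_hevc_decode_params *dec_params)
{
	struct v4l2_ext_control ctrls[3];
	struct v4l2_ext_controls arg;

	memset(ctrls, 0, sizeof(ctrls));
	ctrls[0].id = V4L2_CID_STATELESS_HEVC_SPS;
	ctrls[0].ptr = sps;
	ctrls[0].size = sizeof(*sps);
	ctrls[1].id = V4L2_CID_STATELESS_HEVC_PPS;
	ctrls[1].ptr = pps;
	ctrls[1].size = sizeof(*pps);
	ctrls[2].id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS;
	ctrls[2].ptr = dec_params;
	ctrls[2].size = sizeof(*dec_params);

	memset(&arg, 0, sizeof(arg));
	arg.which = V4L2_CTRL_WHICH_REQUEST_VAL;
	arg.request_fd = request_fd;
	arg.count = 3;
	arg.controls = ctrls;

	/* The controls take effect when the request is queued. */
	return ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &arg);
}

V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (and, where the driver supports it, the entry point offsets array) would be attached the same way, with size covering the whole dynamically sized array.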
Signed-off-by: Benjamin Gaignard Reviewed-by: Ezequiel Garcia Acked-by: Nicolas Dufresne Tested-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/v4l/ext-ctrls-codec-stateless.rst | 902 ++++++++++++++++++++ .../userspace-api/media/v4l/ext-ctrls-codec.rst | 906 --------------------- .../userspace-api/media/v4l/pixfmt-compressed.rst | 2 +- .../userspace-api/media/v4l/vidioc-g-ext-ctrls.rst | 20 + include/media/hevc-ctrls.h | 474 ----------- include/media/v4l2-ctrls.h | 6 - include/uapi/linux/v4l2-controls.h | 459 +++++++++++ include/uapi/linux/videodev2.h | 5 + 8 files changed, 1387 insertions(+), 1387 deletions(-) delete mode 100644 include/media/hevc-ctrls.h (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst index bee73065e993..cd33857d947d 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst @@ -2048,3 +2048,905 @@ This structure contains all loop filter related parameters. See sections - 0x2 - When set, the bitstream contains additional syntax elements that specify which mode and reference frame deltas are to be updated. + +.. _v4l2-codec-stateless-hevc: + +``V4L2_CID_STATELESS_HEVC_SPS (struct)`` + Specifies the Sequence Parameter Set fields (as extracted from the + bitstream) for the associated HEVC slice data. + These bitstream parameters are defined according to :ref:`hevc`. + They are described in section 7.4.3.2 "Sequence parameter set RBSP + semantics" of the specification. + +.. c:type:: v4l2_ctrl_hevc_sps + +.. raw:: latex + + \small + +.. tabularcolumns:: |p{1.2cm}|p{9.2cm}|p{6.9cm}| + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_hevc_sps + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``video_parameter_set_id`` + - Specifies the value of the vps_video_parameter_set_id of the active VPS + as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics" + of H.265 specifications. + * - __u8 + - ``seq_parameter_set_id`` + - Provides an identifier for the SPS for reference by other syntax elements + as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics" + of H.265 specifications. + * - __u16 + - ``pic_width_in_luma_samples`` + - Specifies the width of each decoded picture in units of luma samples. + * - __u16 + - ``pic_height_in_luma_samples`` + - Specifies the height of each decoded picture in units of luma samples. + * - __u8 + - ``bit_depth_luma_minus8`` + - This value plus 8 specifies the bit depth of the samples of the luma array. + * - __u8 + - ``bit_depth_chroma_minus8`` + - This value plus 8 specifies the bit depth of the samples of the chroma arrays. + * - __u8 + - ``log2_max_pic_order_cnt_lsb_minus4`` + - Specifies the value of the variable MaxPicOrderCntLsb. + * - __u8 + - ``sps_max_dec_pic_buffering_minus1`` + - This value plus 1 specifies the maximum required size of the decoded picture buffer for + the coded video sequence (CVS). + * - __u8 + - ``sps_max_num_reorder_pics`` + - Indicates the maximum allowed number of pictures. + * - __u8 + - ``sps_max_latency_increase_plus1`` + - Used to signal MaxLatencyPictures, which indicates the maximum number of + pictures that can precede any picture in output order and follow that + picture in decoding order. 
+ * - __u8 + - ``log2_min_luma_coding_block_size_minus3`` + - This value plus 3 specifies the minimum luma coding block size. + * - __u8 + - ``log2_diff_max_min_luma_coding_block_size`` + - Specifies the difference between the maximum and minimum luma coding block size. + * - __u8 + - ``log2_min_luma_transform_block_size_minus2`` + - This value plus 2 specifies the minimum luma transform block size. + * - __u8 + - ``log2_diff_max_min_luma_transform_block_size`` + - Specifies the difference between the maximum and minimum luma transform block size. + * - __u8 + - ``max_transform_hierarchy_depth_inter`` + - Specifies the maximum hierarchy depth for transform units of coding units coded + in inter prediction mode. + * - __u8 + - ``max_transform_hierarchy_depth_intra`` + - Specifies the maximum hierarchy depth for transform units of coding units coded in + intra prediction mode. + * - __u8 + - ``pcm_sample_bit_depth_luma_minus1`` + - This value plus 1 specifies the number of bits used to represent each of PCM sample values of the + luma component. + * - __u8 + - ``pcm_sample_bit_depth_chroma_minus1`` + - Specifies the number of bits used to represent each of PCM sample values of + the chroma components. + * - __u8 + - ``log2_min_pcm_luma_coding_block_size_minus3`` + - Plus 3 specifies the minimum size of coding blocks. + * - __u8 + - ``log2_diff_max_min_pcm_luma_coding_block_size`` + - Specifies the difference between the maximum and minimum size of coding blocks. + * - __u8 + - ``num_short_term_ref_pic_sets`` + - Specifies the number of st_ref_pic_set() syntax structures included in the SPS. + * - __u8 + - ``num_long_term_ref_pics_sps`` + - Specifies the number of candidate long-term reference pictures that are + specified in the SPS. + * - __u8 + - ``chroma_format_idc`` + - Specifies the chroma sampling. + * - __u8 + - ``sps_max_sub_layers_minus1`` + - This value plus 1 specifies the maximum number of temporal sub-layers. + * - __u64 + - ``flags`` + - See :ref:`Sequence Parameter Set Flags ` + +.. raw:: latex + + \normalsize + +.. _hevc_sps_flags: + +``Sequence Parameter Set Flags`` + +.. raw:: latex + + \small + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE`` + - 0x00000001 + - + * - ``V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED`` + - 0x00000002 + - + * - ``V4L2_HEVC_SPS_FLAG_AMP_ENABLED`` + - 0x00000004 + - + * - ``V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET`` + - 0x00000008 + - + * - ``V4L2_HEVC_SPS_FLAG_PCM_ENABLED`` + - 0x00000010 + - + * - ``V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED`` + - 0x00000020 + - + * - ``V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT`` + - 0x00000040 + - + * - ``V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED`` + - 0x00000080 + - + * - ``V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED`` + - 0x00000100 + - + +.. raw:: latex + + \normalsize + +``V4L2_CID_STATELESS_HEVC_PPS (struct)`` + Specifies the Picture Parameter Set fields (as extracted from the + bitstream) for the associated HEVC slice data. + These bitstream parameters are defined according to :ref:`hevc`. + They are described in section 7.4.3.3 "Picture parameter set RBSP + semantics" of the specification. + +.. c:type:: v4l2_ctrl_hevc_pps + +.. tabularcolumns:: |p{1.2cm}|p{8.6cm}|p{7.5cm}| + +.. cssclass:: longtable + +.. 
flat-table:: struct v4l2_ctrl_hevc_pps + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``pic_parameter_set_id`` + - Identifies the PPS for reference by other syntax elements. + * - __u8 + - ``num_extra_slice_header_bits`` + - Specifies the number of extra slice header bits that are present + in the slice header RBSP for coded pictures referring to the PPS. + * - __u8 + - ``num_ref_idx_l0_default_active_minus1`` + - This value plus 1 specifies the inferred value of num_ref_idx_l0_active_minus1. + * - __u8 + - ``num_ref_idx_l1_default_active_minus1`` + - This value plus 1 specifies the inferred value of num_ref_idx_l1_active_minus1. + * - __s8 + - ``init_qp_minus26`` + - This value plus 26 specifies the initial value of SliceQp Y for each slice + referring to the PPS. + * - __u8 + - ``diff_cu_qp_delta_depth`` + - Specifies the difference between the luma coding tree block size + and the minimum luma coding block size of coding units that + convey cu_qp_delta_abs and cu_qp_delta_sign_flag. + * - __s8 + - ``pps_cb_qp_offset`` + - Specifies the offsets to the luma quantization parameter Cb. + * - __s8 + - ``pps_cr_qp_offset`` + - Specifies the offsets to the luma quantization parameter Cr. + * - __u8 + - ``num_tile_columns_minus1`` + - This value plus 1 specifies the number of tile columns partitioning the picture. + * - __u8 + - ``num_tile_rows_minus1`` + - This value plus 1 specifies the number of tile rows partitioning the picture. + * - __u8 + - ``column_width_minus1[20]`` + - This value plus 1 specifies the width of the i-th tile column in units of + coding tree blocks. + * - __u8 + - ``row_height_minus1[22]`` + - This value plus 1 specifies the height of the i-th tile row in units of coding + tree blocks. + * - __s8 + - ``pps_beta_offset_div2`` + - Specifies the default deblocking parameter offsets for beta divided by 2. + * - __s8 + - ``pps_tc_offset_div2`` + - Specifies the default deblocking parameter offsets for tC divided by 2. + * - __u8 + - ``log2_parallel_merge_level_minus2`` + - This value plus 2 specifies the value of the variable Log2ParMrgLevel. + * - __u8 + - ``padding[4]`` + - Applications and drivers must set this to zero. + * - __u64 + - ``flags`` + - See :ref:`Picture Parameter Set Flags ` + +.. _hevc_pps_flags: + +``Picture Parameter Set Flags`` + +.. raw:: latex + + \small + +.. 
flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED`` + - 0x00000001 + - + * - ``V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT`` + - 0x00000002 + - + * - ``V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED`` + - 0x00000004 + - + * - ``V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT`` + - 0x00000008 + - + * - ``V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED`` + - 0x00000010 + - + * - ``V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED`` + - 0x00000020 + - + * - ``V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED`` + - 0x00000040 + - + * - ``V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT`` + - 0x00000080 + - + * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED`` + - 0x00000100 + - + * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED`` + - 0x00000200 + - + * - ``V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED`` + - 0x00000400 + - + * - ``V4L2_HEVC_PPS_FLAG_TILES_ENABLED`` + - 0x00000800 + - + * - ``V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED`` + - 0x00001000 + - + * - ``V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED`` + - 0x00002000 + - + * - ``V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED`` + - 0x00004000 + - + * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED`` + - 0x00008000 + - + * - ``V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER`` + - 0x00010000 + - + * - ``V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT`` + - 0x00020000 + - + * - ``V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT`` + - 0x00040000 + - + * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT`` + - 0x00080000 + - Specifies the presence of deblocking filter control syntax elements in + the PPS + * - ``V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING`` + - 0x00100000 + - Specifies that tile column boundaries and likewise tile row boundaries + are distributed uniformly across the picture + +.. raw:: latex + + \normalsize + +``V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (struct)`` + Specifies various slice-specific parameters, especially from the NAL unit + header, general slice segment header and weighted prediction parameter + parts of the bitstream. + These bitstream parameters are defined according to :ref:`hevc`. + They are described in section 7.4.7 "General slice segment header + semantics" of the specification. + This control is a dynamically sized 1-dimensional array, + V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. + +.. c:type:: v4l2_ctrl_hevc_slice_params + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}| + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_hevc_slice_params + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u32 + - ``bit_size`` + - Size (in bits) of the current slice data. + * - __u32 + - ``data_byte_offset`` + - Offset (in byte) to the video data in the current slice data. + * - __u32 + - ``num_entry_point_offsets`` + - Specifies the number of entry point offset syntax elements in the slice header. + When the driver supports it, the ``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS`` + must be set. + * - __u8 + - ``nal_unit_type`` + - Specifies the coding type of the slice (B, P or I). + * - __u8 + - ``nuh_temporal_id_plus1`` + - Minus 1 specifies a temporal identifier for the NAL unit. + * - __u8 + - ``slice_type`` + - + (V4L2_HEVC_SLICE_TYPE_I, V4L2_HEVC_SLICE_TYPE_P or + V4L2_HEVC_SLICE_TYPE_B). + * - __u8 + - ``colour_plane_id`` + - Specifies the colour plane associated with the current slice. + * - __s32 + - ``slice_pic_order_cnt`` + - Specifies the picture order count. 
+ * - __u8 + - ``num_ref_idx_l0_active_minus1`` + - This value plus 1 specifies the maximum reference index for reference picture list 0 + that may be used to decode the slice. + * - __u8 + - ``num_ref_idx_l1_active_minus1`` + - This value plus 1 specifies the maximum reference index for reference picture list 1 + that may be used to decode the slice. + * - __u8 + - ``collocated_ref_idx`` + - Specifies the reference index of the collocated picture used for + temporal motion vector prediction. + * - __u8 + - ``five_minus_max_num_merge_cand`` + - Specifies the maximum number of merging motion vector prediction + candidates supported in the slice subtracted from 5. + * - __s8 + - ``slice_qp_delta`` + - Specifies the initial value of QpY to be used for the coding blocks in the slice. + * - __s8 + - ``slice_cb_qp_offset`` + - Specifies a difference to be added to the value of pps_cb_qp_offset. + * - __s8 + - ``slice_cr_qp_offset`` + - Specifies a difference to be added to the value of pps_cr_qp_offset. + * - __s8 + - ``slice_act_y_qp_offset`` + - Specifies the offset to the luma of quantization parameter qP derived in section 8.6.2 + * - __s8 + - ``slice_act_cb_qp_offset`` + - Specifies the offset to the cb of quantization parameter qP derived in section 8.6.2 + * - __s8 + - ``slice_act_cr_qp_offset`` + - Specifies the offset to the cr of quantization parameter qP derived in section 8.6.2 + * - __s8 + - ``slice_beta_offset_div2`` + - Specifies the deblocking parameter offsets for beta divided by 2. + * - __s8 + - ``slice_tc_offset_div2`` + - Specifies the deblocking parameter offsets for tC divided by 2. + * - __u8 + - ``pic_struct`` + - Indicates whether a picture should be displayed as a frame or as one or more fields. + * - __u32 + - ``slice_segment_addr`` + - Specifies the address of the first coding tree block in the slice segment. + * - __u8 + - ``ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The list of L0 reference elements as indices in the DPB. + * - __u8 + - ``ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The list of L1 reference elements as indices in the DPB. + * - __u16 + - ``short_term_ref_pic_set_size`` + - Specifies the size, in bits, of the short-term reference picture set, described as st_ref_pic_set() + in the specification, included in the slice header or SPS (section 7.3.6.1). + * - __u16 + - ``long_term_ref_pic_set_size`` + - Specifies the size, in bits, of the long-term reference picture set include in the slice header + or SPS. It is the number of bits in the conditional block if(long_term_ref_pics_present_flag) + in section 7.3.6.1 of the specification. + * - __u8 + - ``padding`` + - Applications and drivers must set this to zero. + * - struct :c:type:`v4l2_hevc_pred_weight_table` + - ``pred_weight_table`` + - The prediction weight coefficients for inter-picture prediction. + * - __u64 + - ``flags`` + - See :ref:`Slice Parameters Flags ` + +.. raw:: latex + + \normalsize + +.. _hevc_slice_params_flags: + +``Slice Parameters Flags`` + +.. raw:: latex + + \scriptsize + +.. 
flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA`` + - 0x00000001 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA`` + - 0x00000002 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED`` + - 0x00000004 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO`` + - 0x00000008 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT`` + - 0x00000010 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0`` + - 0x00000020 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV`` + - 0x00000040 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED`` + - 0x00000080 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED`` + - 0x00000100 + - + * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT`` + - 0x00000200 + - + +.. raw:: latex + + \normalsize + +``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (integer)`` + Specifies entry point offsets in bytes. + This control is a dynamically sized array. The number of entry point + offsets is reported by the ``elems`` field. + This bitstream parameter is defined according to :ref:`hevc`. + They are described in section 7.4.7.1 "General slice segment header + semantics" of the specification. + When multiple slices are submitted in a request, the length of + this array must be the sum of num_entry_point_offsets of all the + slices in the request. + +``V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (struct)`` + Specifies the HEVC scaling matrix parameters used for the scaling process + for transform coefficients. + These matrix and parameters are defined according to :ref:`hevc`. + They are described in section 7.4.5 "Scaling list data semantics" of + the specification. + +.. c:type:: v4l2_ctrl_hevc_scaling_matrix + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}| + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_hevc_scaling_matrix + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``scaling_list_4x4[6][16]`` + - Scaling list is used for the scaling process for transform + coefficients. The values on each scaling list are expected + in raster scan order. + * - __u8 + - ``scaling_list_8x8[6][64]`` + - Scaling list is used for the scaling process for transform + coefficients. The values on each scaling list are expected + in raster scan order. + * - __u8 + - ``scaling_list_16x16[6][64]`` + - Scaling list is used for the scaling process for transform + coefficients. The values on each scaling list are expected + in raster scan order. + * - __u8 + - ``scaling_list_32x32[2][64]`` + - Scaling list is used for the scaling process for transform + coefficients. The values on each scaling list are expected + in raster scan order. + * - __u8 + - ``scaling_list_dc_coef_16x16[6]`` + - Scaling list is used for the scaling process for transform + coefficients. The values on each scaling list are expected + in raster scan order. + * - __u8 + - ``scaling_list_dc_coef_32x32[2]`` + - Scaling list is used for the scaling process for transform + coefficients. The values on each scaling list are expected + in raster scan order. + +.. raw:: latex + + \normalsize + +.. c:type:: v4l2_hevc_dpb_entry + +.. raw:: latex + + \small + +.. tabularcolumns:: |p{1.0cm}|p{4.2cm}|p{12.1cm}| + +.. 
flat-table:: struct v4l2_hevc_dpb_entry + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u64 + - ``timestamp`` + - Timestamp of the V4L2 capture buffer to use as reference, used + with B-coded and P-coded frames. The timestamp refers to the + ``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the + :c:func:`v4l2_timeval_to_ns()` function to convert the struct + :c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64. + * - __u8 + - ``flags`` + - Long term flag for the reference frame + (V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE). The flag is set as + described in the ITU HEVC specification chapter "8.3.2 Decoding + process for reference picture set". + * - __u8 + - ``field_pic`` + - Whether the reference is a field picture or a frame. + See :ref:`HEVC dpb field pic Flags ` + * - __s32 + - ``pic_order_cnt_val`` + - The picture order count of the current picture. + * - __u8 + - ``padding[2]`` + - Applications and drivers must set this to zero. + +.. raw:: latex + + \normalsize + +.. _hevc_dpb_field_pic_flags: + +``HEVC dpb field pic Flags`` + +.. raw:: latex + + \scriptsize + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME`` + - 0 + - (progressive) Frame + * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD`` + - 1 + - Top field + * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD`` + - 2 + - Bottom field + * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM`` + - 3 + - Top field, bottom field, in that order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP`` + - 4 + - Bottom field, top field, in that order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP`` + - 5 + - Top field, bottom field, top field repeated, in that order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM`` + - 6 + - Bottom field, top field, bottom field repeated, in that order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING`` + - 7 + - Frame doubling + * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING`` + - 8 + - Frame tripling + * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM`` + - 9 + - Top field paired with previous bottom field in output order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP`` + - 10 + - Bottom field paired with previous top field in output order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM`` + - 11 + - Top field paired with next bottom field in output order + * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP`` + - 12 + - Bottom field paired with next top field in output order + +.. c:type:: v4l2_hevc_pred_weight_table + +.. raw:: latex + + \footnotesize + +.. tabularcolumns:: |p{0.8cm}|p{10.6cm}|p{5.9cm}| + +.. flat-table:: struct v4l2_hevc_pred_weight_table + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __s8 + - ``delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The difference of the weighting factor applied to the luma + prediction value for list 0. + * - __s8 + - ``luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The additive offset applied to the luma prediction value for list 0. + * - __s8 + - ``delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` + - The difference of the weighting factor applied to the chroma + prediction value for list 0. + * - __s8 + - ``chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` + - The difference of the additive offset applied to the chroma + prediction values for list 0. + * - __s8 + - ``delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The difference of the weighting factor applied to the luma + prediction value for list 1. 
+ * - __s8 + - ``luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The additive offset applied to the luma prediction value for list 1. + * - __s8 + - ``delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` + - The difference of the weighting factor applied to the chroma + prediction value for list 1. + * - __s8 + - ``chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` + - The difference of the additive offset applied to the chroma + prediction values for list 1. + * - __u8 + - ``luma_log2_weight_denom`` + - The base 2 logarithm of the denominator for all luma weighting + factors. + * - __s8 + - ``delta_chroma_log2_weight_denom`` + - The difference of the base 2 logarithm of the denominator for + all chroma weighting factors. + * - __u8 + - ``padding[6]`` + - Applications and drivers must set this to zero. + +.. raw:: latex + + \normalsize + +``V4L2_CID_STATELESS_HEVC_DECODE_MODE (enum)`` + Specifies the decoding mode to use. Currently exposes slice-based and + frame-based decoding but new modes might be added later on. + This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE + pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE + are required to set this control in order to specify the decoding mode + that is expected for the buffer. + Drivers may expose a single or multiple decoding modes, depending + on what they can support. + +.. c:type:: v4l2_stateless_hevc_decode_mode + +.. raw:: latex + + \small + +.. tabularcolumns:: |p{9.4cm}|p{0.6cm}|p{7.3cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED`` + - 0 + - Decoding is done at the slice granularity. + The OUTPUT buffer must contain a single slice. + * - ``V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED`` + - 1 + - Decoding is done at the frame granularity. + The OUTPUT buffer must contain all slices needed to decode the + frame. + +.. raw:: latex + + \normalsize + +``V4L2_CID_STATELESS_HEVC_START_CODE (enum)`` + Specifies the HEVC slice start code expected for each slice. + This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE + pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE + are required to set this control in order to specify the start code + that is expected for the buffer. + Drivers may expose a single or multiple start codes, depending + on what they can support. + +.. c:type:: v4l2_stateless_hevc_start_code + +.. tabularcolumns:: |p{9.2cm}|p{0.6cm}|p{7.5cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_STATELESS_HEVC_START_CODE_NONE`` + - 0 + - Selecting this value specifies that HEVC slices are passed + to the driver without any start code. The bitstream data should be + according to :ref:`hevc` 7.3.1.1 General NAL unit syntax, hence + contains emulation prevention bytes when required. + * - ``V4L2_STATELESS_HEVC_START_CODE_ANNEX_B`` + - 1 + - Selecting this value specifies that HEVC slices are expected + to be prefixed by Annex B start codes. According to :ref:`hevc` + valid start codes can be 3-bytes 0x000001 or 4-bytes 0x00000001. + +.. raw:: latex + + \normalsize + +``V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID (integer)`` + Specifies a priority identifier for the NAL unit, which will be applied to + the base layer. By default this value is set to 0 for the base layer, + and the next layer will have the priority ID assigned as 1, 2, 3 and so on. + The video encoder can't decide the priority id to be applied to a layer, + so this has to come from client. 
+ This is applicable to H264 and valid Range is from 0 to 63. + Source Rec. ITU-T H.264 (06/2019); G.7.4.1.1, G.8.8.1. + +``V4L2_CID_MPEG_VIDEO_LTR_COUNT (integer)`` + Specifies the maximum number of Long Term Reference (LTR) frames at any + given time that the encoder can keep. + This is applicable to the H264 and HEVC encoders. + +``V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX (integer)`` + After setting this control the frame that will be queued next + will be marked as a Long Term Reference (LTR) frame + and given this LTR index which ranges from 0 to LTR_COUNT-1. + This is applicable to the H264 and HEVC encoders. + Source Rec. ITU-T H.264 (06/2019); Table 7.9 + +``V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (bitmask)`` + Specifies the Long Term Reference (LTR) frame(s) to be used for + encoding the next frame queued after setting this control. + This provides a bitmask which consists of bits [0, LTR_COUNT-1]. + This is applicable to the H264 and HEVC encoders. + +``V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (struct)`` + Specifies various decode parameters, especially the references picture order + count (POC) for all the lists (short, long, before, current, after) and the + number of entries for each of them. + These parameters are defined according to :ref:`hevc`. + They are described in section 8.3 "Slice decoding process" of the + specification. + +.. c:type:: v4l2_ctrl_hevc_decode_params + +.. cssclass:: longtable + +.. flat-table:: struct v4l2_ctrl_hevc_decode_params + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __s32 + - ``pic_order_cnt_val`` + - PicOrderCntVal as described in section 8.3.1 "Decoding process + for picture order count" of the specification. + * - __u16 + - ``short_term_ref_pic_set_size`` + - Specifies the size, in bits, of the short-term reference picture set, of the first slice + described as st_ref_pic_set() in the specification, included in the slice header + or SPS (section 7.3.6.1). + * - __u16 + - ``long_term_ref_pic_set_size`` + - Specifies the size, in bits, of the long-term reference picture set, of the first slice + included in the slice header or SPS. It is the number of bits in the conditional block + if(long_term_ref_pics_present_flag) in section 7.3.6.1 of the specification. + * - __u8 + - ``num_active_dpb_entries`` + - The number of entries in ``dpb``. + * - __u8 + - ``num_poc_st_curr_before`` + - The number of reference pictures in the short-term set that come before + the current frame. + * - __u8 + - ``num_poc_st_curr_after`` + - The number of reference pictures in the short-term set that come after + the current frame. + * - __u8 + - ``num_poc_lt_curr`` + - The number of reference pictures in the long-term set. + * - __u8 + - ``poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - PocStCurrBefore as described in section 8.3.2 "Decoding process for reference + picture set": provides the index of the short term before references in DPB array. + * - __u8 + - ``poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - PocStCurrAfter as described in section 8.3.2 "Decoding process for reference + picture set": provides the index of the short term after references in DPB array. + * - __u8 + - ``poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - PocLtCurr as described in section 8.3.2 "Decoding process for reference + picture set": provides the index of the long term references in DPB array. + * - struct :c:type:`v4l2_hevc_dpb_entry` + - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` + - The decoded picture buffer, for meta-data about reference frames. 
+ * - __u64 + - ``flags`` + - See :ref:`Decode Parameters Flags ` + +.. _hevc_decode_params_flags: + +``Decode Parameters Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC`` + - 0x00000001 + - + * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC`` + - 0x00000002 + - + * - ``V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR`` + - 0x00000004 + - diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst index 6cf68588af1b..2a165ae063fb 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst @@ -2658,909 +2658,3 @@ enum v4l2_mpeg_video_hevc_size_of_length_field - Indicates whether to generate SPS and PPS at every IDR. Setting it to 0 disables generating SPS and PPS at every IDR. Setting it to one enables generating SPS and PPS at every IDR. - -.. _v4l2-mpeg-hevc: - -``V4L2_CID_STATELESS_HEVC_SPS (struct)`` - Specifies the Sequence Parameter Set fields (as extracted from the - bitstream) for the associated HEVC slice data. - These bitstream parameters are defined according to :ref:`hevc`. - They are described in section 7.4.3.2 "Sequence parameter set RBSP - semantics" of the specification. - -.. c:type:: v4l2_ctrl_hevc_sps - -.. raw:: latex - - \small - -.. tabularcolumns:: |p{1.2cm}|p{9.2cm}|p{6.9cm}| - -.. cssclass:: longtable - -.. flat-table:: struct v4l2_ctrl_hevc_sps - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __u8 - - ``video_parameter_set_id`` - - Specifies the value of the vps_video_parameter_set_id of the active VPS - as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics" - of H.265 specifications. - * - __u8 - - ``seq_parameter_set_id`` - - Provides an identifier for the SPS for reference by other syntax elements - as described in section "7.4.3.2.1 General sequence parameter set RBSP semantics" - of H.265 specifications. - * - __u16 - - ``pic_width_in_luma_samples`` - - Specifies the width of each decoded picture in units of luma samples. - * - __u16 - - ``pic_height_in_luma_samples`` - - Specifies the height of each decoded picture in units of luma samples. - * - __u8 - - ``bit_depth_luma_minus8`` - - This value plus 8 specifies the bit depth of the samples of the luma array. - * - __u8 - - ``bit_depth_chroma_minus8`` - - This value plus 8 specifies the bit depth of the samples of the chroma arrays. - * - __u8 - - ``log2_max_pic_order_cnt_lsb_minus4`` - - This value plus 4 specifies the value of the variable MaxPicOrderCntLsb. - * - __u8 - - ``sps_max_dec_pic_buffering_minus1`` - - This value plus 1 specifies the maximum required size of the decoded picture buffer for - the codec video sequence. - * - __u8 - - ``sps_max_num_reorder_pics`` - - Indicates the maximum allowed number of pictures. - * - __u8 - - ``sps_max_latency_increase_plus1`` - - Not equal to 0 is used to compute the value of SpsMaxLatencyPictures array. - * - __u8 - - ``log2_min_luma_coding_block_size_minus3`` - - This value plus 3 specifies the minimum luma coding block size. - * - __u8 - - ``log2_diff_max_min_luma_coding_block_size`` - - Specifies the difference between the maximum and minimum luma coding block size. - * - __u8 - - ``log2_min_luma_transform_block_size_minus2`` - - This value plus 2 specifies the minimum luma transform block size. 
- * - __u8 - - ``log2_diff_max_min_luma_transform_block_size`` - - Specifies the difference between the maximum and minimum luma transform block size. - * - __u8 - - ``max_transform_hierarchy_depth_inter`` - - Specifies the maximum hierarchy depth for transform units of coding units coded - in inter prediction mode. - * - __u8 - - ``max_transform_hierarchy_depth_intra`` - - Specifies the maximum hierarchy depth for transform units of coding units coded in - intra prediction mode. - * - __u8 - - ``pcm_sample_bit_depth_luma_minus1`` - - This value plus 1 specifies the number of bits used to represent each of PCM sample - values of the luma component. - * - __u8 - - ``pcm_sample_bit_depth_chroma_minus1`` - - This value plus 1 specifies the number of bits used to represent each of PCM sample - values of the chroma components. - * - __u8 - - ``log2_min_pcm_luma_coding_block_size_minus3`` - - This value plus 3 specifies the minimum size of coding blocks. - * - __u8 - - ``log2_diff_max_min_pcm_luma_coding_block_size`` - - Specifies the difference between the maximum and minimum size of coding blocks. - * - __u8 - - ``num_short_term_ref_pic_sets`` - - Specifies the number of st_ref_pic_set() syntax structures included in the SPS. - * - __u8 - - ``num_long_term_ref_pics_sps`` - - Specifies the number of candidate long-term reference pictures that are - specified in the SPS. - * - __u8 - - ``chroma_format_idc`` - - Specifies the chroma sampling. - * - __u8 - - ``sps_max_sub_layers_minus1`` - - This value plus 1 specifies the maximum number of temporal sub-layers. - * - __u64 - - ``flags`` - - See :ref:`Sequence Parameter Set Flags ` - -.. raw:: latex - - \normalsize - -.. _hevc_sps_flags: - -``Sequence Parameter Set Flags`` - -.. raw:: latex - - \small - -.. cssclass:: longtable - -.. flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE`` - - 0x00000001 - - - * - ``V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED`` - - 0x00000002 - - - * - ``V4L2_HEVC_SPS_FLAG_AMP_ENABLED`` - - 0x00000004 - - - * - ``V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET`` - - 0x00000008 - - - * - ``V4L2_HEVC_SPS_FLAG_PCM_ENABLED`` - - 0x00000010 - - - * - ``V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED`` - - 0x00000020 - - - * - ``V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT`` - - 0x00000040 - - - * - ``V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED`` - - 0x00000080 - - - * - ``V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED`` - - 0x00000100 - - - -.. raw:: latex - - \normalsize - -``V4L2_CID_STATELESS_HEVC_PPS (struct)`` - Specifies the Picture Parameter Set fields (as extracted from the - bitstream) for the associated HEVC slice data. - These bitstream parameters are defined according to :ref:`hevc`. - They are described in section 7.4.3.3 "Picture parameter set RBSP - semantics" of the specification. - -.. c:type:: v4l2_ctrl_hevc_pps - -.. tabularcolumns:: |p{1.2cm}|p{8.6cm}|p{7.5cm}| - -.. cssclass:: longtable - -.. flat-table:: struct v4l2_ctrl_hevc_pps - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __u8 - - ``pic_parameter_set_id`` - - Identifies the PPS for reference by other syntax elements. - * - __u8 - - ``num_extra_slice_header_bits`` - - Specifies the number of extra slice header bits that are present - in the slice header RBSP for coded pictures referring to the PPS. - * - __u8 - - ``num_ref_idx_l0_default_active_minus1`` - - This value plus 1 specifies the inferred value of num_ref_idx_l0_active_minus1. 
- * - __u8 - - ``num_ref_idx_l1_default_active_minus1`` - - This value plus 1 specifies the inferred value of num_ref_idx_l1_active_minus1. - * - __s8 - - ``init_qp_minus26`` - - This value plus 26 specifies the initial value of SliceQp Y for each slice - referring to the PPS. - * - __u8 - - ``diff_cu_qp_delta_depth`` - - Specifies the difference between the luma coding tree block size - and the minimum luma coding block size of coding units that - convey cu_qp_delta_abs and cu_qp_delta_sign_flag. - * - __s8 - - ``pps_cb_qp_offset`` - - Specifies the offsets to the luma quantization parameter Cb. - * - __s8 - - ``pps_cr_qp_offset`` - - Specifies the offsets to the luma quantization parameter Cr. - * - __u8 - - ``num_tile_columns_minus1`` - - This value plus 1 specifies the number of tile columns partitioning the picture. - * - __u8 - - ``num_tile_rows_minus1`` - - This value plus 1 specifies the number of tile rows partitioning the picture. - * - __u8 - - ``column_width_minus1[20]`` - - Plus 1 specifies the width of each tile column in units of - coding tree blocks. - * - __u8 - - ``row_height_minus1[22]`` - - This value plus 1 specifies the height of each tile row in units of coding - tree blocks. - * - __s8 - - ``pps_beta_offset_div2`` - - Specifies the default deblocking parameter offsets for beta divided by 2. - * - __s8 - - ``pps_tc_offset_div2`` - - Specifies the default deblocking parameter offsets for tC divided by 2. - * - __u8 - - ``log2_parallel_merge_level_minus2`` - - Plus 2 specifies the value of the variable Log2ParMrgLevel. - * - __u8 - - ``padding[4]`` - - Applications and drivers must set this to zero. - * - __u64 - - ``flags`` - - See :ref:`Picture Parameter Set Flags ` - -.. _hevc_pps_flags: - -``Picture Parameter Set Flags`` - -.. raw:: latex - - \small - -.. 
flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED`` - - 0x00000001 - - - * - ``V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT`` - - 0x00000002 - - - * - ``V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED`` - - 0x00000004 - - - * - ``V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT`` - - 0x00000008 - - - * - ``V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED`` - - 0x00000010 - - - * - ``V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED`` - - 0x00000020 - - - * - ``V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED`` - - 0x00000040 - - - * - ``V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT`` - - 0x00000080 - - - * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED`` - - 0x00000100 - - - * - ``V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED`` - - 0x00000200 - - - * - ``V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED`` - - 0x00000400 - - - * - ``V4L2_HEVC_PPS_FLAG_TILES_ENABLED`` - - 0x00000800 - - - * - ``V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED`` - - 0x00001000 - - - * - ``V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED`` - - 0x00002000 - - - * - ``V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED`` - - 0x00004000 - - - * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED`` - - 0x00008000 - - - * - ``V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER`` - - 0x00010000 - - - * - ``V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT`` - - 0x00020000 - - - * - ``V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT`` - - 0x00040000 - - - * - ``V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT`` - - 0x00080000 - - Specifies the presence of deblocking filter control syntax elements in - the PPS - * - ``V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING`` - - 0x00100000 - - Specifies that tile column boundaries and likewise tile row boundaries - are distributed uniformly across the picture - -.. raw:: latex - - \normalsize - -``V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (struct)`` - Specifies various slice-specific parameters, especially from the NAL unit - header, general slice segment header and weighted prediction parameter - parts of the bitstream. - These bitstream parameters are defined according to :ref:`hevc`. - They are described in section 7.4.7 "General slice segment header - semantics" of the specification. - This control is a dynamically sized 1-dimensional array, - V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. - -.. c:type:: v4l2_ctrl_hevc_slice_params - -.. raw:: latex - - \scriptsize - -.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}| - -.. cssclass:: longtable - -.. flat-table:: struct v4l2_ctrl_hevc_slice_params - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __u32 - - ``bit_size`` - - Size (in bits) of the current slice data. - * - __u32 - - ``data_byte_offset`` - - Offset (in bytes) to the video data in the current slice data. - * - __u32 - - ``num_entry_point_offsets`` - - Specifies the number of entry point offset syntax elements in the slice header. - When the driver supports it, the ``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS`` - must be set. - * - __u8 - - ``nal_unit_type`` - - Specifies the coding type of the slice (B, P or I). - * - __u8 - - ``nuh_temporal_id_plus1`` - - This value minus 1 specifies a temporal identifier for the NAL unit. - * - __u8 - - ``slice_type`` - - - (V4L2_HEVC_SLICE_TYPE_I, V4L2_HEVC_SLICE_TYPE_P or - V4L2_HEVC_SLICE_TYPE_B). - * - __u8 - - ``colour_plane_id`` - - Specifies the colour plane associated with the current slice. - * - __s32 - - ``slice_pic_order_cnt`` - - Specifies the picture order count. 
- * - __u8 - - ``num_ref_idx_l0_active_minus1`` - - This value plus 1 specifies the maximum reference index for - reference picture list 0 that may be used to decode the slice. - * - __u8 - - ``num_ref_idx_l1_active_minus1`` - - This value plus 1 specifies the maximum reference index for - reference picture list 1 that may be used to decode the slice. - * - __u8 - - ``collocated_ref_idx`` - - Specifies the reference index of the collocated picture used for - temporal motion vector prediction. - * - __u8 - - ``five_minus_max_num_merge_cand`` - - Specifies the maximum number of merging motion vector prediction - candidates supported in the slice subtracted from 5. - * - __s8 - - ``slice_qp_delta`` - - Specifies the initial value of QpY to be used for the coding blocks in the slice. - * - __s8 - - ``slice_cb_qp_offset`` - - Specifies a difference to be added to the value of pps_cb_qp_offset. - * - __s8 - - ``slice_cr_qp_offset`` - - Specifies a difference to be added to the value of pps_cr_qp_offset. - * - __s8 - - ``slice_act_y_qp_offset`` - - Screen content extension parameters. - * - __s8 - - ``slice_act_cb_qp_offset`` - - Screen content extension parameters. - * - __s8 - - ``slice_act_cr_qp_offset`` - - Screen content extension parameters. - * - __s8 - - ``slice_beta_offset_div2`` - - Specifies the deblocking parameter offsets for beta divided by 2. - * - __s8 - - ``slice_tc_offset_div2`` - - Specifies the deblocking parameter offsets for tC divided by 2. - * - __u8 - - ``pic_struct`` - - Indicates whether a picture should be displayed as a frame or as one or more fields. - * - __u32 - - ``slice_segment_addr`` - - Specifies the address of the first coding tree block in the slice segment. - * - __u8 - - ``ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The list of L0 reference elements as indices in the DPB. - * - __u8 - - ``ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The list of L1 reference elements as indices in the DPB. - * - __u16 - - ``short_term_ref_pic_set_size`` - - Specifies the size, in bits, of the short-term reference picture set, described as st_ref_pic_set() - in the specification, included in the slice header or SPS (section 7.3.6.1). - * - __u16 - - ``long_term_ref_pic_set_size`` - - Specifies the size, in bits, of the long-term reference picture set include in the slice header - or SPS. It is the number of bits in the conditional block if(long_term_ref_pics_present_flag) - in section 7.3.6.1 of the specification. - * - __u8 - - ``padding`` - - Applications and drivers must set this to zero. - * - struct :c:type:`v4l2_hevc_pred_weight_table` - - ``pred_weight_table`` - - The prediction weight coefficients for inter-picture prediction. - * - __u64 - - ``flags`` - - See :ref:`Slice Parameters Flags ` - -.. raw:: latex - - \normalsize - -.. _hevc_slice_params_flags: - -``Slice Parameters Flags`` - -.. raw:: latex - - \scriptsize - -.. 
flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA`` - - 0x00000001 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA`` - - 0x00000002 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED`` - - 0x00000004 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO`` - - 0x00000008 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT`` - - 0x00000010 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0`` - - 0x00000020 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV`` - - 0x00000040 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED`` - - 0x00000080 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED`` - - 0x00000100 - - - * - ``V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT`` - - 0x00000200 - - - -.. raw:: latex - - \normalsize - -``V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (integer)`` - Specifies entry point offsets in bytes. - This control is a dynamically sized array. The number of entry point - offsets is reported by the ``elems`` field. - This bitstream parameter is defined according to :ref:`hevc`. - They are described in section 7.4.7.1 "General slice segment header - semantics" of the specification. - When multiple slices are submitted in a request, the length of - this array must be the sum of num_entry_point_offsets of all the - slices in the request. - -``V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (struct)`` - Specifies the HEVC scaling matrix parameters used for the scaling process - for transform coefficients. - These matrix and parameters are defined according to :ref:`hevc`. - They are described in section 7.4.5 "Scaling list data semantics" of - the specification. - -.. c:type:: v4l2_ctrl_hevc_scaling_matrix - -.. raw:: latex - - \scriptsize - -.. tabularcolumns:: |p{5.4cm}|p{6.8cm}|p{5.1cm}| - -.. cssclass:: longtable - -.. flat-table:: struct v4l2_ctrl_hevc_scaling_matrix - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __u8 - - ``scaling_list_4x4[6][16]`` - - Scaling list is used for the scaling process for transform - coefficients. The values on each scaling list are expected - in raster scan order. - * - __u8 - - ``scaling_list_8x8[6][64]`` - - Scaling list is used for the scaling process for transform - coefficients. The values on each scaling list are expected - in raster scan order. - * - __u8 - - ``scaling_list_16x16[6][64]`` - - Scaling list is used for the scaling process for transform - coefficients. The values on each scaling list are expected - in raster scan order. - * - __u8 - - ``scaling_list_32x32[2][64]`` - - Scaling list is used for the scaling process for transform - coefficients. The values on each scaling list are expected - in raster scan order. - * - __u8 - - ``scaling_list_dc_coef_16x16[6]`` - - Scaling list is used for the scaling process for transform - coefficients. The values on each scaling list are expected - in raster scan order. - * - __u8 - - ``scaling_list_dc_coef_32x32[2]`` - - Scaling list is used for the scaling process for transform - coefficients. The values on each scaling list are expected - in raster scan order. - -.. raw:: latex - - \normalsize - -.. c:type:: v4l2_hevc_dpb_entry - -.. raw:: latex - - \small - -.. tabularcolumns:: |p{1.0cm}|p{4.2cm}|p{12.1cm}| - -.. 
flat-table:: struct v4l2_hevc_dpb_entry - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __u64 - - ``timestamp`` - - Timestamp of the V4L2 capture buffer to use as reference, used - with B-coded and P-coded frames. The timestamp refers to the - ``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the - :c:func:`v4l2_timeval_to_ns()` function to convert the struct - :c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64. - * - __u8 - - ``flags`` - - Long term flag for the reference frame - (V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE). The flag is set as - described in the ITU HEVC specification chapter "8.3.2 Decoding - process for reference picture set". - * - __u8 - - ``field_pic`` - - Whether the reference is a field picture or a frame. - See :ref:`HEVC dpb field pic Flags <hevc_dpb_field_pic_flags>` - * - __s32 - - ``pic_order_cnt_val`` - - The picture order count of the current picture. - * - __u8 - - ``padding[2]`` - - Applications and drivers must set this to zero. - -.. raw:: latex - - \normalsize - -.. _hevc_dpb_field_pic_flags: - -``HEVC dpb field pic Flags`` - -.. raw:: latex - - \scriptsize - -.. flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME`` - - 0 - - (progressive) Frame - * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD`` - - 1 - - Top field - * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD`` - - 2 - - Bottom field - * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM`` - - 3 - - Top field, bottom field, in that order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP`` - - 4 - - Bottom field, top field, in that order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP`` - - 5 - - Top field, bottom field, top field repeated, in that order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM`` - - 6 - - Bottom field, top field, bottom field repeated, in that order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING`` - - 7 - - Frame doubling - * - ``V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING`` - - 8 - - Frame tripling - * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM`` - - 9 - - Top field paired with previous bottom field in output order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP`` - - 10 - - Bottom field paired with previous top field in output order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM`` - - 11 - - Top field paired with next bottom field in output order - * - ``V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP`` - - 12 - - Bottom field paired with next top field in output order - -.. c:type:: v4l2_hevc_pred_weight_table - -.. raw:: latex - - \footnotesize - -.. tabularcolumns:: |p{0.8cm}|p{10.6cm}|p{5.9cm}| - -.. flat-table:: struct v4l2_hevc_pred_weight_table - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __s8 - - ``delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The difference of the weighting factor applied to the luma - prediction value for list 0. - * - __s8 - - ``luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The additive offset applied to the luma prediction value for list 0. - * - __s8 - - ``delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` - - The difference of the weighting factor applied to the chroma - prediction value for list 0. - * - __s8 - - ``chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` - - The difference of the additive offset applied to the chroma - prediction values for list 0. - * - __s8 - - ``delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The difference of the weighting factor applied to the luma - prediction value for list 1. 
- * - __s8 - - ``luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The additive offset applied to the luma prediction value for list 1. - * - __s8 - - ``delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` - - The difference of the weighting factor applied to the chroma - prediction value for list 1. - * - __s8 - - ``chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]`` - - The difference of the additive offset applied to the chroma - prediction values for list 1. - * - __u8 - - ``luma_log2_weight_denom`` - - The base 2 logarithm of the denominator for all luma weighting - factors. - * - __s8 - - ``delta_chroma_log2_weight_denom`` - - The difference of the base 2 logarithm of the denominator for - all chroma weighting factors. - * - __u8 - - ``padding[6]`` - - Applications and drivers must set this to zero. - -.. raw:: latex - - \normalsize - -``V4L2_CID_STATELESS_HEVC_DECODE_MODE (enum)`` - Specifies the decoding mode to use. Currently exposes slice-based and - frame-based decoding but new modes might be added later on. - This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE - pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE - are required to set this control in order to specify the decoding mode - that is expected for the buffer. - Drivers may expose a single or multiple decoding modes, depending - on what they can support. - - .. note:: - - This menu control is not yet part of the public kernel API and - it is expected to change. - -.. c:type:: v4l2_stateless_hevc_decode_mode - -.. raw:: latex - - \small - -.. tabularcolumns:: |p{9.4cm}|p{0.6cm}|p{7.3cm}| - -.. flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED`` - - 0 - - Decoding is done at the slice granularity. - The OUTPUT buffer must contain a single slice. - * - ``V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED`` - - 1 - - Decoding is done at the frame granularity. - The OUTPUT buffer must contain all slices needed to decode the - frame. The OUTPUT buffer must also contain both fields. - -.. raw:: latex - - \normalsize - -``V4L2_CID_STATELESS_HEVC_START_CODE (enum)`` - Specifies the HEVC slice start code expected for each slice. - This control is used as a modifier for V4L2_PIX_FMT_HEVC_SLICE - pixel format. Applications that support V4L2_PIX_FMT_HEVC_SLICE - are required to set this control in order to specify the start code - that is expected for the buffer. - Drivers may expose a single or multiple start codes, depending - on what they can support. - - .. note:: - - This menu control is not yet part of the public kernel API and - it is expected to change. - -.. c:type:: v4l2_stateless_hevc_start_code - -.. tabularcolumns:: |p{9.2cm}|p{0.6cm}|p{7.5cm}| - -.. flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_STATELESS_HEVC_START_CODE_NONE`` - - 0 - - Selecting this value specifies that HEVC slices are passed - to the driver without any start code. The bitstream data should be - according to :ref:`hevc` 7.3.1.1 General NAL unit syntax, hence - contains emulation prevention bytes when required. - * - ``V4L2_STATELESS_HEVC_START_CODE_ANNEX_B`` - - 1 - - Selecting this value specifies that HEVC slices are expected - to be prefixed by Annex B start codes. According to :ref:`hevc` - valid start codes can be 3-bytes 0x000001 or 4-bytes 0x00000001. - -``V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID (integer)`` - Specifies a priority identifier for the NAL unit, which will be applied to - the base layer. 
By default this value is set to 0 for the base layer, - and the next layer will have the priority ID assigned as 1, 2, 3 and so on. - The video encoder can't decide the priority ID to be applied to a layer, - so this has to come from the client. - This is applicable to H264 and the valid range is from 0 to 63. - Source Rec. ITU-T H.264 (06/2019); G.7.4.1.1, G.8.8.1. - -``V4L2_CID_MPEG_VIDEO_LTR_COUNT (integer)`` - Specifies the maximum number of Long Term Reference (LTR) frames at any - given time that the encoder can keep. - This is applicable to the H264 and HEVC encoders. - -``V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX (integer)`` - After setting this control the frame that will be queued next - will be marked as a Long Term Reference (LTR) frame - and given this LTR index which ranges from 0 to LTR_COUNT-1. - This is applicable to the H264 and HEVC encoders. - Source Rec. ITU-T H.264 (06/2019); Table 7.9 - -``V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (bitmask)`` - Specifies the Long Term Reference (LTR) frame(s) to be used for - encoding the next frame queued after setting this control. - This provides a bitmask which consists of bits [0, LTR_COUNT-1]. - This is applicable to the H264 and HEVC encoders. - -``V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (struct)`` - Specifies various decode parameters, especially the reference picture order - count (POC) for all the lists (short, long, before, current, after) and the - number of entries for each of them. - These parameters are defined according to :ref:`hevc`. - They are described in section 8.3 "Slice decoding process" of the - specification. - -.. c:type:: v4l2_ctrl_hevc_decode_params - -.. cssclass:: longtable - -.. flat-table:: struct v4l2_ctrl_hevc_decode_params - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - __s32 - - ``pic_order_cnt_val`` - - PicOrderCntVal as described in section 8.3.1 "Decoding process - for picture order count" of the specification. - * - __u16 - - ``short_term_ref_pic_set_size`` - - Specifies the size, in bits, of the short-term reference picture set, of the first slice - described as st_ref_pic_set() in the specification, included in the slice header - or SPS (section 7.3.6.1). - * - __u16 - - ``long_term_ref_pic_set_size`` - - Specifies the size, in bits, of the long-term reference picture set, of the first slice - included in the slice header or SPS. It is the number of bits in the conditional block - if(long_term_ref_pics_present_flag) in section 7.3.6.1 of the specification. - * - __u8 - - ``num_active_dpb_entries`` - - The number of entries in ``dpb``. - * - __u8 - - ``num_poc_st_curr_before`` - - The number of reference pictures in the short-term set that come before - the current frame. - * - __u8 - - ``num_poc_st_curr_after`` - - The number of reference pictures in the short-term set that come after - the current frame. - * - __u8 - - ``num_poc_lt_curr`` - - The number of reference pictures in the long-term set. - * - __u8 - - ``poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - PocStCurrBefore as described in section 8.3.2 "Decoding process for reference - picture set": provides the index of the short term before references in DPB array. - * - __u8 - - ``poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - PocStCurrAfter as described in section 8.3.2 "Decoding process for reference - picture set": provides the index of the short term after references in DPB array. 
- * - __u8 - - ``poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - PocLtCurr as described in section 8.3.2 "Decoding process for reference - picture set": provides the index of the long term references in DPB array. - * - struct :c:type:`v4l2_hevc_dpb_entry` - - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - - The decoded picture buffer, for meta-data about reference frames. - * - __u64 - - ``flags`` - - See :ref:`Decode Parameters Flags <hevc_decode_params_flags>` - -.. _hevc_decode_params_flags: - -``Decode Parameters Flags`` - -.. cssclass:: longtable - -.. flat-table:: - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC`` - - 0x00000001 - - - * - ``V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC`` - - 0x00000002 - - - * - ``V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR`` - - 0x00000004 - - diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst index c352d91a73d8..506dd3c98884 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst @@ -212,7 +212,7 @@ Compressed Formats ``V4L2_CID_MPEG_VIDEO_HEVC_SPS``, ``V4L2_CID_MPEG_VIDEO_HEVC_PPS``, and ``V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS``. - See the :ref:`associated Codec Control IDs `. + See the :ref:`associated Codec Control IDs `. Buffers associated with this pixel format must contain the appropriate number of macroblocks to decode a full corresponding frame. * .. _V4L2-PIX-FMT-FWHT: diff --git a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst index 29971a45a2d4..892cfeb8b988 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst @@ -249,6 +249,26 @@ still cause this situation. - ``p_hdr10_mastering`` - A pointer to a struct :c:type:`v4l2_ctrl_hdr10_mastering_display`. Valid if this control is of type ``V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY``. + * - struct :c:type:`v4l2_ctrl_hevc_sps` * + - ``p_hevc_sps`` + - A pointer to a struct :c:type:`v4l2_ctrl_hevc_sps`. Valid if this + control is of type ``V4L2_CTRL_TYPE_HEVC_SPS``. + * - struct :c:type:`v4l2_ctrl_hevc_pps` * + - ``p_hevc_pps`` + - A pointer to a struct :c:type:`v4l2_ctrl_hevc_pps`. Valid if this + control is of type ``V4L2_CTRL_TYPE_HEVC_PPS``. + * - struct :c:type:`v4l2_ctrl_hevc_slice_params` * + - ``p_hevc_slice_params`` + - A pointer to a struct :c:type:`v4l2_ctrl_hevc_slice_params`. Valid if this + control is of type ``V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS``. + * - struct :c:type:`v4l2_ctrl_hevc_scaling_matrix` * + - ``p_hevc_scaling_matrix`` + - A pointer to a struct :c:type:`v4l2_ctrl_hevc_scaling_matrix`. Valid if this + control is of type ``V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX``. + * - struct :c:type:`v4l2_ctrl_hevc_decode_params` * + - ``p_hevc_decode_params`` + - A pointer to a struct :c:type:`v4l2_ctrl_hevc_decode_params`. Valid if this + control is of type ``V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS``. * - void * - ``ptr`` - A pointer to a compound type which can be an N-dimensional array diff --git a/include/media/hevc-ctrls.h b/include/media/hevc-ctrls.h deleted file mode 100644 index c89029b3c5da..000000000000 --- a/include/media/hevc-ctrls.h +++ /dev/null @@ -1,474 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * These are the HEVC state controls for use with stateless HEVC - * codec drivers. 
- * - * It turns out that these structs are not stable yet and will undergo - * more changes. So keep them private until they are stable and ready to - * become part of the official public API. - */ - -#ifndef _HEVC_CTRLS_H_ -#define _HEVC_CTRLS_H_ - -#include - -#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) -#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) -#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) -#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) -#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) -#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) -#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) -#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_BASE + 1017) - -enum v4l2_stateless_hevc_decode_mode { - V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, - V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, -}; - -enum v4l2_stateless_hevc_start_code { - V4L2_STATELESS_HEVC_START_CODE_NONE, - V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, -}; - -#define V4L2_HEVC_SLICE_TYPE_B 0 -#define V4L2_HEVC_SLICE_TYPE_P 1 -#define V4L2_HEVC_SLICE_TYPE_I 2 - -#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) - -/** - * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set - * - * @video_parameter_set_id: specifies the value of the - * vps_video_parameter_set_id of the active VPS - * @seq_parameter_set_id: provides an identifier for the SPS for - * reference by other syntax elements - * @pic_width_in_luma_samples: specifies the width of each decoded picture - * in units of luma samples - * @pic_height_in_luma_samples: specifies the height of each decoded picture - * in units of luma samples - * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the - * samples of the luma array - * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the - * samples of the chroma arrays - * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value - * of the variable MaxPicOrderCntLsb - * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum - * required size of the decoded picture - * buffer for the codec video sequence - * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures - * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the - * value of SpsMaxLatencyPictures array - * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the - * minimum luma coding block size - * @log2_diff_max_min_luma_coding_block_size: specifies the difference between - * the maximum and minimum luma - * coding block size - * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the - * minimum luma transform block size - * @log2_diff_max_min_luma_transform_block_size: specifies the difference between - * the maximum and minimum luma - * transform block size - * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy - * depth for transform units of - * coding units coded in inter - * prediction mode - * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy - * depth for transform units of - * coding units coded in intra - * prediction mode - * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of - * bits used to represent each of PCM sample - * values of the luma component - * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number - * of bits used to represent each of PCM - * sample values of the chroma components - * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the - * minimum size of coding blocks - * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between - * the maximum and minimum size of - * coding blocks - * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() - * syntax structures included in the SPS - * @num_long_term_ref_pics_sps: specifies the number of candidate long-term - * reference pictures that are specified in the SPS - * @chroma_format_idc: specifies the chroma sampling - * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number - * of temporal sub-layers - * @reserved: padding field. Should be zeroed by applications. 
- * @flags: see V4L2_HEVC_SPS_FLAG_{} - */ -struct v4l2_ctrl_hevc_sps { - __u8 video_parameter_set_id; - __u8 seq_parameter_set_id; - __u16 pic_width_in_luma_samples; - __u16 pic_height_in_luma_samples; - __u8 bit_depth_luma_minus8; - __u8 bit_depth_chroma_minus8; - __u8 log2_max_pic_order_cnt_lsb_minus4; - __u8 sps_max_dec_pic_buffering_minus1; - __u8 sps_max_num_reorder_pics; - __u8 sps_max_latency_increase_plus1; - __u8 log2_min_luma_coding_block_size_minus3; - __u8 log2_diff_max_min_luma_coding_block_size; - __u8 log2_min_luma_transform_block_size_minus2; - __u8 log2_diff_max_min_luma_transform_block_size; - __u8 max_transform_hierarchy_depth_inter; - __u8 max_transform_hierarchy_depth_intra; - __u8 pcm_sample_bit_depth_luma_minus1; - __u8 pcm_sample_bit_depth_chroma_minus1; - __u8 log2_min_pcm_luma_coding_block_size_minus3; - __u8 log2_diff_max_min_pcm_luma_coding_block_size; - __u8 num_short_term_ref_pic_sets; - __u8 num_long_term_ref_pics_sps; - __u8 chroma_format_idc; - __u8 sps_max_sub_layers_minus1; - - __u8 reserved[6]; - __u64 flags; -}; - -#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) - -/** - * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set - * - * @pic_parameter_set_id: identifies the PPS for reference by other - * syntax elements - * @num_extra_slice_header_bits: specifies the number of extra slice header - * bits that are present in the slice header RBSP - * for coded pictures referring to the PPS. 
- * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the inferred - * value of num_ref_idx_l0_active_minus1 - * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the inferred - * value of num_ref_idx_l1_active_minus1 - * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y - * for each slice referring to the PPS - * @diff_cu_qp_delta_depth: specifies the difference between the luma coding - * tree block size and the minimum luma coding block - * size of coding units that convey cu_qp_delta_abs - * and cu_qp_delta_sign_flag - * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb - * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr - * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns - * partitioning the picture - * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows - * partitioning the picture - * @column_width_minus1: this value plus 1 specifies the width of each tile column - * in units of coding tree blocks - * @row_height_minus1: this value plus 1 specifies the height of each tile row in - * units of coding tree blocks - * @pps_beta_offset_div2: specify the default deblocking parameter offsets for - * beta divided by 2 - * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC - * divided by 2 - * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of - * the variable Log2ParMrgLevel - * @reserved: padding field. Should be zeroed by applications. - * @flags: see V4L2_HEVC_PPS_FLAG_{} - */ -struct v4l2_ctrl_hevc_pps { - __u8 pic_parameter_set_id; - __u8 num_extra_slice_header_bits; - __u8 num_ref_idx_l0_default_active_minus1; - __u8 num_ref_idx_l1_default_active_minus1; - __s8 init_qp_minus26; - __u8 diff_cu_qp_delta_depth; - __s8 pps_cb_qp_offset; - __s8 pps_cr_qp_offset; - __u8 num_tile_columns_minus1; - __u8 num_tile_rows_minus1; - __u8 column_width_minus1[20]; - __u8 row_height_minus1[22]; - __s8 pps_beta_offset_div2; - __s8 pps_tc_offset_div2; - __u8 log2_parallel_merge_level_minus2; - __u8 reserved; - __u64 flags; -}; - -#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 - -#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 -#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 -#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 -#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 -#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 -#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 -#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 -#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 -#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 -#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 -#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 -#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 -#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 - -#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 - -/** - * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry - * - * @timestamp: timestamp of the V4L2 capture buffer to use as reference. - * @flags: long term flag for the reference frame - * @field_pic: whether the reference is a field picture or a frame. - * @reserved: padding field. Should be zeroed by applications. - * @pic_order_cnt_val: the picture order count of the reference. 
- */ -struct v4l2_hevc_dpb_entry { - __u64 timestamp; - __u8 flags; - __u8 field_pic; - __u16 reserved; - __s32 pic_order_cnt_val; -}; - -/** - * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters - * - * @delta_luma_weight_l0: the difference of the weighting factor applied - * to the luma prediction value for list 0 - * @luma_offset_l0: the additive offset applied to the luma prediction value - * for list 0 - * @delta_chroma_weight_l0: the difference of the weighting factor applied - * to the chroma prediction values for list 0 - * @chroma_offset_l0: the difference of the additive offset applied to - * the chroma prediction values for list 0 - * @delta_luma_weight_l1: the difference of the weighting factor applied - * to the luma prediction value for list 1 - * @luma_offset_l1: the additive offset applied to the luma prediction value - * for list 1 - * @delta_chroma_weight_l1: the difference of the weighting factor applied - * to the chroma prediction values for list 1 - * @chroma_offset_l1: the difference of the additive offset applied to - * the chroma prediction values for list 1 - * @luma_log2_weight_denom: the base 2 logarithm of the denominator for - * all luma weighting factors - * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm - * of the denominator for all chroma - * weighting factors - */ -struct v4l2_hevc_pred_weight_table { - __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; - __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; - - __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; - __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; - - __u8 luma_log2_weight_denom; - __s8 delta_chroma_log2_weight_denom; -}; - -#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) - -/** - * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters - * - * This control is a dynamically sized 1-dimensional array, - * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. - * - * @bit_size: size (in bits) of the current slice data - * @data_byte_offset: offset (in bytes) to the video data in the current slice data - * @num_entry_point_offsets: specifies the number of entry point offset syntax - * elements in the slice header. 
- * @nal_unit_type: specifies the coding type of the slice (B, P or I) - * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit - * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} - * @colour_plane_id: specifies the colour plane associated with the current slice - * @slice_pic_order_cnt: specifies the picture order count - * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum reference - * index for reference picture list 0 that may be - * used to decode the slice - * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum reference - * index for reference picture list 1 that may be - * used to decode the slice - * @collocated_ref_idx: specifies the reference index of the collocated picture used - * for temporal motion vector prediction - * @five_minus_max_num_merge_cand: specifies the maximum number of merging - * motion vector prediction candidates supported in - * the slice subtracted from 5 - * @slice_qp_delta: specifies the initial value of QpY to be used for the coding - * blocks in the slice - * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset - * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset - * @slice_act_y_qp_offset: screen content extension parameters - * @slice_act_cb_qp_offset: screen content extension parameters - * @slice_act_cr_qp_offset: screen content extension parameters - * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 - * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 - * @pic_struct: indicates whether a picture should be displayed as a frame or as one or - * more fields - * @reserved0: padding field. Should be zeroed by applications. - * @slice_segment_addr: specifies the address of the first coding tree block in - * the slice segment - * @ref_idx_l0: the list of L0 reference elements as indices in the DPB - * @ref_idx_l1: the list of L1 reference elements as indices in the DPB - * @short_term_ref_pic_set_size: specifies the size of short-term reference - * pictures included in the SPS - * @long_term_ref_pic_set_size: specifies the size of long-term reference - * picture include in the SPS - * @pred_weight_table: the prediction weight coefficients for inter-picture - * prediction - * @reserved1: padding field. Should be zeroed by applications. - * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} - */ -struct v4l2_ctrl_hevc_slice_params { - __u32 bit_size; - __u32 data_byte_offset; - __u32 num_entry_point_offsets; - /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ - __u8 nal_unit_type; - __u8 nuh_temporal_id_plus1; - - /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ - __u8 slice_type; - __u8 colour_plane_id; - __s32 slice_pic_order_cnt; - __u8 num_ref_idx_l0_active_minus1; - __u8 num_ref_idx_l1_active_minus1; - __u8 collocated_ref_idx; - __u8 five_minus_max_num_merge_cand; - __s8 slice_qp_delta; - __s8 slice_cb_qp_offset; - __s8 slice_cr_qp_offset; - __s8 slice_act_y_qp_offset; - __s8 slice_act_cb_qp_offset; - __s8 slice_act_cr_qp_offset; - __s8 slice_beta_offset_div2; - __s8 slice_tc_offset_div2; - - /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ - __u8 pic_struct; - - __u8 reserved0[3]; - /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ - __u32 slice_segment_addr; - __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u16 short_term_ref_pic_set_size; - __u16 long_term_ref_pic_set_size; - - /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ - struct v4l2_hevc_pred_weight_table pred_weight_table; - - __u8 reserved1[2]; - __u64 flags; -}; - -#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 - -/** - * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters - * - * @pic_order_cnt_val: picture order count - * @short_term_ref_pic_set_size: specifies the size of short-term reference - * pictures set included in the SPS of the first slice - * @long_term_ref_pic_set_size: specifies the size of long-term reference - * pictures set include in the SPS of the first slice - * @num_active_dpb_entries: the number of entries in dpb - * @num_poc_st_curr_before: the number of reference pictures in the short-term - * set that come before the current frame - * @num_poc_st_curr_after: the number of reference pictures in the short-term - * set that come after the current frame - * @num_poc_lt_curr: the number of reference pictures in the long-term set - * @poc_st_curr_before: provides the index of the short term before references - * in DPB array - * @poc_st_curr_after: provides the index of the short term after references - * in DPB array - * @poc_lt_curr: provides the index of the long term references in DPB array - * @reserved: padding field. Should be zeroed by applications. - * @dpb: the decoded picture buffer, for meta-data about reference frames - * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} - */ -struct v4l2_ctrl_hevc_decode_params { - __s32 pic_order_cnt_val; - __u16 short_term_ref_pic_set_size; - __u16 long_term_ref_pic_set_size; - __u8 num_active_dpb_entries; - __u8 num_poc_st_curr_before; - __u8 num_poc_st_curr_after; - __u8 num_poc_lt_curr; - __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u8 reserved[4]; - struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u64 flags; -}; - -/** - * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters - * - * @scaling_list_4x4: scaling list is used for the scaling process for - * transform coefficients. The values on each scaling - * list are expected in raster scan order - * @scaling_list_8x8: scaling list is used for the scaling process for - * transform coefficients. The values on each scaling - * list are expected in raster scan order - * @scaling_list_16x16: scaling list is used for the scaling process for - * transform coefficients. The values on each scaling - * list are expected in raster scan order - * @scaling_list_32x32: scaling list is used for the scaling process for - * transform coefficients. The values on each scaling - * list are expected in raster scan order - * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process - * for transform coefficients. The values on each - * scaling list are expected in raster scan order. - * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process - * for transform coefficients. The values on each - * scaling list are expected in raster scan order. 
- */ -struct v4l2_ctrl_hevc_scaling_matrix { - __u8 scaling_list_4x4[6][16]; - __u8 scaling_list_8x8[6][64]; - __u8 scaling_list_16x16[6][64]; - __u8 scaling_list_32x32[2][64]; - __u8 scaling_list_dc_coef_16x16[6]; - __u8 scaling_list_dc_coef_32x32[2]; -}; - -#endif diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index f4105de8a8d2..00828a4f9404 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -13,12 +13,6 @@ #include #include -/* - * Include the stateless codec compound control definitions. - * This will move to the public headers once this API is fully stable. - */ -#include - /* forward references */ struct file; struct poll_table_struct; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index dfff69ed88f7..5f46bf4a570c 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1997,6 +1997,465 @@ struct v4l2_ctrl_mpeg2_quantisation { __u8 chroma_non_intra_quantiser_matrix[64]; }; +#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) +#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) +#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) +#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) +#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) +#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) +#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) +#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) + +enum v4l2_stateless_hevc_decode_mode { + V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_stateless_hevc_start_code { + V4L2_STATELESS_HEVC_START_CODE_NONE, + V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/** + * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set + * + * @video_parameter_set_id: specifies the value of the + * vps_video_parameter_set_id of the active VPS + * @seq_parameter_set_id: provides an identifier for the SPS for + * reference by other syntax elements + * @pic_width_in_luma_samples: specifies the width of each decoded picture + * in units of luma samples + * @pic_height_in_luma_samples: specifies the height of each decoded picture + * in units of luma samples + * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the + * samples of the luma array + * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the + * samples of the chroma arrays + * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of + * the variable MaxPicOrderCntLsb + * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum + * required size of the decoded picture + * buffer for the codec video sequence + * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures + * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the + * value of SpsMaxLatencyPictures array + * @log2_min_luma_coding_block_size_minus3: this value plus 3 specifies the minimum + * luma coding block size + * @log2_diff_max_min_luma_coding_block_size: specifies the difference between + * the maximum and minimum luma + * coding block size + * @log2_min_luma_transform_block_size_minus2: this value plus 2 specifies the + * minimum luma transform block size + * @log2_diff_max_min_luma_transform_block_size: specifies the difference between + * the maximum and minimum luma + * transform block size + * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy + * depth for transform units of + * coding units coded in inter + * prediction mode + * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy + * depth for transform units of + * coding units coded in intra + * prediction mode + * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of + * bits used to represent each of PCM sample + * values of the luma component + * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number + * of bits used to represent each of PCM + * sample values of the chroma components + * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the + * minimum size of coding blocks + * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between + * the maximum and minimum size of + * coding blocks + * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() + * syntax structures included in the SPS + * @num_long_term_ref_pics_sps: specifies the number of candidate long-term + * reference pictures that are specified in the SPS + * @chroma_format_idc: specifies the chroma sampling + * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number + * of temporal sub-layers + * @reserved: padding field. Should be zeroed by applications. 
+ * @flags: see V4L2_HEVC_SPS_FLAG_{} + */ +struct v4l2_ctrl_hevc_sps { + __u8 video_parameter_set_id; + __u8 seq_parameter_set_id; + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u8 reserved[6]; + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) +#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) + +/** + * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set + * + * @pic_parameter_set_id: identifies the PPS for reference by other + * syntax elements + * @num_extra_slice_header_bits: specifies the number of extra slice header + * bits that are present in the slice header RBSP + * for coded pictures referring to the PPS. 
+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the + * inferred value of num_ref_idx_l0_active_minus1 + * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the + * inferred value of num_ref_idx_l1_active_minus1 + * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQpY for + * each slice referring to the PPS + * @diff_cu_qp_delta_depth: specifies the difference between the luma coding + * tree block size and the minimum luma coding block + * size of coding units that convey cu_qp_delta_abs + * and cu_qp_delta_sign_flag + * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb + * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr + * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns + * partitioning the picture + * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning + * the picture + * @column_width_minus1: this value plus 1 specifies the width of each tile column in + * units of coding tree blocks + * @row_height_minus1: this value plus 1 specifies the height of each tile row in + * units of coding tree blocks + * @pps_beta_offset_div2: specify the default deblocking parameter offsets for + * beta divided by 2 + * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC + * divided by 2 + * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of + * the variable Log2ParMrgLevel + * @reserved: padding field. Should be zeroed by applications. + * @flags: see V4L2_HEVC_PPS_FLAG_{} + */ +struct v4l2_ctrl_hevc_pps { + __u8 pic_parameter_set_id; + __u8 num_extra_slice_header_bits; + __u8 num_ref_idx_l0_default_active_minus1; + __u8 num_ref_idx_l1_default_active_minus1; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + __u8 reserved; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 + +#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 +#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 +#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 +#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 +#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +/** + * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry + * + * @timestamp: timestamp of the V4L2 capture buffer to use as reference. + * @flags: long term flag for the reference frame + * @field_pic: whether the reference is a field picture or a frame. + * @reserved: padding field. Should be zeroed by applications. + * @pic_order_cnt_val: the picture order count of the current picture. 
+ */ +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 flags; + __u8 field_pic; + __u16 reserved; + __s32 pic_order_cnt_val; +}; + +/** + * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters + * + * @delta_luma_weight_l0: the difference of the weighting factor applied + * to the luma prediction value for list 0 + * @luma_offset_l0: the additive offset applied to the luma prediction value + * for list 0 + * @delta_chroma_weight_l0: the difference of the weighting factor applied + * to the chroma prediction values for list 0 + * @chroma_offset_l0: the difference of the additive offset applied to + * the chroma prediction values for list 0 + * @delta_luma_weight_l1: the difference of the weighting factor applied + * to the luma prediction value for list 1 + * @luma_offset_l1: the additive offset applied to the luma prediction value + * for list 1 + * @delta_chroma_weight_l1: the difference of the weighting factor applied + * to the chroma prediction values for list 1 + * @chroma_offset_l1: the difference of the additive offset applied to + * the chroma prediction values for list 1 + * @luma_log2_weight_denom: the base 2 logarithm of the denominator for + * all luma weighting factors + * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm + * of the denominator for all chroma + * weighting factors + */ +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +/** + * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters + * + * This control is a dynamically sized 1-dimensional array, + * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. + * + * @bit_size: size (in bits) of the current slice data + * @data_byte_offset: offset (in bytes) to the video data in the current slice data + * @num_entry_point_offsets: specifies the number of entry point offset syntax + * elements in the slice header. 
+ * @nal_unit_type: specifies the coding type of the slice (B, P or I) + * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit + * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} + * @colour_plane_id: specifies the colour plane associated with the current slice + * @slice_pic_order_cnt: specifies the picture order count + * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum + * reference index for reference picture list 0 + * that may be used to decode the slice + * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum + * reference index for reference picture list 1 + * that may be used to decode the slice + * @collocated_ref_idx: specifies the reference index of the collocated picture used + * for temporal motion vector prediction + * @five_minus_max_num_merge_cand: specifies the maximum number of merging + * motion vector prediction candidates supported in + * the slice subtracted from 5 + * @slice_qp_delta: specifies the initial value of QpY to be used for the coding + * blocks in the slice + * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset + * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset + * @slice_act_y_qp_offset: screen content extension parameters + * @slice_act_cb_qp_offset: screen content extension parameters + * @slice_act_cr_qp_offset: screen content extension parameters + * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 + * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 + * @pic_struct: indicates whether a picture should be displayed as a frame or as one or + * more fields + * @reserved0: padding field. Should be zeroed by applications. + * @slice_segment_addr: specifies the address of the first coding tree block in + * the slice segment + * @ref_idx_l0: the list of L0 reference elements as indices in the DPB + * @ref_idx_l1: the list of L1 reference elements as indices in the DPB + * @short_term_ref_pic_set_size: specifies the size of short-term reference + * pictures set included in the SPS + * @long_term_ref_pic_set_size: specifies the size of long-term reference + * pictures set included in the SPS + * @pred_weight_table: the prediction weight coefficients for inter-picture + * prediction + * @reserved1: padding field. Should be zeroed by applications. + * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} + */ +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_byte_offset; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __s32 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + __u8 reserved0[3]; + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + __u32 slice_segment_addr; + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u16 short_term_ref_pic_set_size; + __u16 long_term_ref_pic_set_size; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u8 reserved1[2]; + __u64 flags; +}; + +#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 +#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 +#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 + +/** + * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters + * + * @pic_order_cnt_val: picture order count + * @short_term_ref_pic_set_size: specifies the size of short-term reference + * pictures set included in the SPS of the first slice + * @long_term_ref_pic_set_size: specifies the size of long-term reference + * pictures set included in the SPS of the first slice + * @num_active_dpb_entries: the number of entries in dpb + * @num_poc_st_curr_before: the number of reference pictures in the short-term + * set that come before the current frame + * @num_poc_st_curr_after: the number of reference pictures in the short-term + * set that come after the current frame + * @num_poc_lt_curr: the number of reference pictures in the long-term set + * @poc_st_curr_before: provides the index of the short term before references + * in DPB array + * @poc_st_curr_after: provides the index of the short term after references + * in DPB array + * @poc_lt_curr: provides the index of the long term references in DPB array + * @reserved: padding field. Should be zeroed by applications. + * @dpb: the decoded picture buffer, for meta-data about reference frames + * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} + */ +struct v4l2_ctrl_hevc_decode_params { + __s32 pic_order_cnt_val; + __u16 short_term_ref_pic_set_size; + __u16 long_term_ref_pic_set_size; + __u8 num_active_dpb_entries; + __u8 num_poc_st_curr_before; + __u8 num_poc_st_curr_after; + __u8 num_poc_lt_curr; + __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 reserved[4]; + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u64 flags; +}; + +/** + * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters + * + * @scaling_list_4x4: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_8x8: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_16x16: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_32x32: scaling list is used for the scaling process for + * transform coefficients. The values on each scaling + * list are expected in raster scan order + * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process + * for transform coefficients. The values on each + * scaling list are expected in raster scan order. + * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process + * for transform coefficients. The values on each + * scaling list are expected in raster scan order. 
+ */ +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + #define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) #define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index cff2bb78b2cc..d6fac2344033 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1793,6 +1793,11 @@ struct v4l2_ext_control { struct v4l2_ctrl_mpeg2_quantisation __user *p_mpeg2_quantisation; struct v4l2_ctrl_vp9_compressed_hdr __user *p_vp9_compressed_hdr_probs; struct v4l2_ctrl_vp9_frame __user *p_vp9_frame; + struct v4l2_ctrl_hevc_sps __user *p_hevc_sps; + struct v4l2_ctrl_hevc_pps __user *p_hevc_pps; + struct v4l2_ctrl_hevc_slice_params __user *p_hevc_slice_params; + struct v4l2_ctrl_hevc_scaling_matrix __user *p_hevc_scaling_matrix; + struct v4l2_ctrl_hevc_decode_params __user *p_hevc_decode_params; void __user *ptr; }; } __attribute__ ((packed)); -- cgit v1.2.3 From ceefa81e6e69b020997205e5c30a42d43aa5ae63 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 15 Jul 2022 16:03:22 +0200 Subject: serial: remove VR41XX serial driver Commit d3164e2f3b0a ("MIPS: Remove VR41xx support") removed support for MIPS VR41xx platform, so remove exclusive drivers for this platform, too. Signed-off-by: Thomas Bogendoerfer Link: https://lore.kernel.org/r/20220715140322.135825-1-tsbogend@alpha.franken.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 17 - drivers/tty/serial/Makefile | 1 - drivers/tty/serial/vr41xx_siu.c | 932 --------------------------------------- include/uapi/linux/serial_core.h | 4 - 4 files changed, 954 deletions(-) delete mode 100644 drivers/tty/serial/vr41xx_siu.c (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 8a3ee1525d80..f92963a2226b 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -890,23 +890,6 @@ config SERIAL_TXX9_STDSERIAL bool "TX39XX/49XX SIO act as standard serial" depends on !SERIAL_8250 && SERIAL_TXX9 -config SERIAL_VR41XX - tristate "NEC VR4100 series Serial Interface Unit support" - depends on CPU_VR41XX - select SERIAL_CORE - help - If you have a NEC VR4100 series processor and you want to use - Serial Interface Unit(SIU) or Debug Serial Interface Unit(DSIU) - (not include VR4111/VR4121 DSIU), say Y. Otherwise, say N. - -config SERIAL_VR41XX_CONSOLE - bool "Enable NEC VR4100 series Serial Interface Unit console" - depends on SERIAL_VR41XX=y - select SERIAL_CORE_CONSOLE - help - If you have a NEC VR4100 series processor and you want to use - a console on a serial port, say Y. Otherwise, say N. 
- config SERIAL_JSM tristate "Digi International NEO and Classic PCI Support" depends on PCI diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 61cc8de95571..238a9557b487 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -51,7 +51,6 @@ obj-$(CONFIG_SERIAL_SCCNXP) += sccnxp.o obj-$(CONFIG_SERIAL_SC16IS7XX_CORE) += sc16is7xx.o obj-$(CONFIG_SERIAL_JSM) += jsm/ obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o -obj-$(CONFIG_SERIAL_VR41XX) += vr41xx_siu.o obj-$(CONFIG_SERIAL_ATMEL) += atmel_serial.o obj-$(CONFIG_SERIAL_UARTLITE) += uartlite.o obj-$(CONFIG_SERIAL_MSM) += msm_serial.o diff --git a/drivers/tty/serial/vr41xx_siu.c b/drivers/tty/serial/vr41xx_siu.c deleted file mode 100644 index 1ba689a81abd..000000000000 --- a/drivers/tty/serial/vr41xx_siu.c +++ /dev/null @@ -1,932 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Driver for NEC VR4100 series Serial Interface Unit. - * - * Copyright (C) 2004-2008 Yoichi Yuasa - * - * Based on drivers/serial/8250.c, by Russell King. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define SIU_BAUD_BASE 1152000 -#define SIU_MAJOR 204 -#define SIU_MINOR_BASE 82 - -#define RX_MAX_COUNT 256 -#define TX_MAX_COUNT 15 - -#define SIUIRSEL 0x08 - #define TMICMODE 0x20 - #define TMICTX 0x10 - #define IRMSEL 0x0c - #define IRMSEL_HP 0x08 - #define IRMSEL_TEMIC 0x04 - #define IRMSEL_SHARP 0x00 - #define IRUSESEL 0x02 - #define SIRSEL 0x01 - -static struct uart_port siu_uart_ports[SIU_PORTS_MAX] = { - [0 ... SIU_PORTS_MAX-1] = { - .lock = __SPIN_LOCK_UNLOCKED(siu_uart_ports->lock), - .irq = 0, - }, -}; - -#ifdef CONFIG_SERIAL_VR41XX_CONSOLE -static uint8_t lsr_break_flag[SIU_PORTS_MAX]; -#endif - -#define siu_read(port, offset) readb((port)->membase + (offset)) -#define siu_write(port, offset, value) writeb((value), (port)->membase + (offset)) - -void vr41xx_select_siu_interface(siu_interface_t interface) -{ - struct uart_port *port; - unsigned long flags; - uint8_t irsel; - - port = &siu_uart_ports[0]; - - spin_lock_irqsave(&port->lock, flags); - - irsel = siu_read(port, SIUIRSEL); - if (interface == SIU_INTERFACE_IRDA) - irsel |= SIRSEL; - else - irsel &= ~SIRSEL; - siu_write(port, SIUIRSEL, irsel); - - spin_unlock_irqrestore(&port->lock, flags); -} -EXPORT_SYMBOL_GPL(vr41xx_select_siu_interface); - -void vr41xx_use_irda(irda_use_t use) -{ - struct uart_port *port; - unsigned long flags; - uint8_t irsel; - - port = &siu_uart_ports[0]; - - spin_lock_irqsave(&port->lock, flags); - - irsel = siu_read(port, SIUIRSEL); - if (use == FIR_USE_IRDA) - irsel |= IRUSESEL; - else - irsel &= ~IRUSESEL; - siu_write(port, SIUIRSEL, irsel); - - spin_unlock_irqrestore(&port->lock, flags); -} -EXPORT_SYMBOL_GPL(vr41xx_use_irda); - -void vr41xx_select_irda_module(irda_module_t module, irda_speed_t speed) -{ - struct uart_port *port; - unsigned long flags; - uint8_t irsel; - - port = &siu_uart_ports[0]; - - spin_lock_irqsave(&port->lock, flags); - - irsel = siu_read(port, SIUIRSEL); - irsel &= ~(IRMSEL | TMICTX | TMICMODE); - switch (module) { - case SHARP_IRDA: - irsel |= IRMSEL_SHARP; - break; - case TEMIC_IRDA: - irsel |= IRMSEL_TEMIC | TMICMODE; - if (speed == IRDA_TX_4MBPS) - irsel |= TMICTX; - break; - case HP_IRDA: - irsel |= IRMSEL_HP; - break; - default: - break; - } - siu_write(port, SIUIRSEL, irsel); - - spin_unlock_irqrestore(&port->lock, flags); -} -EXPORT_SYMBOL_GPL(vr41xx_select_irda_module); - -static 
inline void siu_clear_fifo(struct uart_port *port) -{ - siu_write(port, UART_FCR, UART_FCR_ENABLE_FIFO); - siu_write(port, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | - UART_FCR_CLEAR_XMIT); - siu_write(port, UART_FCR, 0); -} - -static inline unsigned long siu_port_size(struct uart_port *port) -{ - switch (port->type) { - case PORT_VR41XX_SIU: - return 11UL; - case PORT_VR41XX_DSIU: - return 8UL; - } - - return 0; -} - -static inline unsigned int siu_check_type(struct uart_port *port) -{ - if (port->line == 0) - return PORT_VR41XX_SIU; - if (port->line == 1 && port->irq) - return PORT_VR41XX_DSIU; - - return PORT_UNKNOWN; -} - -static inline const char *siu_type_name(struct uart_port *port) -{ - switch (port->type) { - case PORT_VR41XX_SIU: - return "SIU"; - case PORT_VR41XX_DSIU: - return "DSIU"; - } - - return NULL; -} - -static unsigned int siu_tx_empty(struct uart_port *port) -{ - uint8_t lsr; - - lsr = siu_read(port, UART_LSR); - if (lsr & UART_LSR_TEMT) - return TIOCSER_TEMT; - - return 0; -} - -static void siu_set_mctrl(struct uart_port *port, unsigned int mctrl) -{ - uint8_t mcr = 0; - - if (mctrl & TIOCM_DTR) - mcr |= UART_MCR_DTR; - if (mctrl & TIOCM_RTS) - mcr |= UART_MCR_RTS; - if (mctrl & TIOCM_OUT1) - mcr |= UART_MCR_OUT1; - if (mctrl & TIOCM_OUT2) - mcr |= UART_MCR_OUT2; - if (mctrl & TIOCM_LOOP) - mcr |= UART_MCR_LOOP; - - siu_write(port, UART_MCR, mcr); -} - -static unsigned int siu_get_mctrl(struct uart_port *port) -{ - uint8_t msr; - unsigned int mctrl = 0; - - msr = siu_read(port, UART_MSR); - if (msr & UART_MSR_DCD) - mctrl |= TIOCM_CAR; - if (msr & UART_MSR_RI) - mctrl |= TIOCM_RNG; - if (msr & UART_MSR_DSR) - mctrl |= TIOCM_DSR; - if (msr & UART_MSR_CTS) - mctrl |= TIOCM_CTS; - - return mctrl; -} - -static void siu_stop_tx(struct uart_port *port) -{ - unsigned long flags; - uint8_t ier; - - spin_lock_irqsave(&port->lock, flags); - - ier = siu_read(port, UART_IER); - ier &= ~UART_IER_THRI; - siu_write(port, UART_IER, ier); - - spin_unlock_irqrestore(&port->lock, flags); -} - -static void siu_start_tx(struct uart_port *port) -{ - unsigned long flags; - uint8_t ier; - - spin_lock_irqsave(&port->lock, flags); - - ier = siu_read(port, UART_IER); - ier |= UART_IER_THRI; - siu_write(port, UART_IER, ier); - - spin_unlock_irqrestore(&port->lock, flags); -} - -static void siu_stop_rx(struct uart_port *port) -{ - unsigned long flags; - uint8_t ier; - - spin_lock_irqsave(&port->lock, flags); - - ier = siu_read(port, UART_IER); - ier &= ~UART_IER_RLSI; - siu_write(port, UART_IER, ier); - - port->read_status_mask &= ~UART_LSR_DR; - - spin_unlock_irqrestore(&port->lock, flags); -} - -static void siu_enable_ms(struct uart_port *port) -{ - unsigned long flags; - uint8_t ier; - - spin_lock_irqsave(&port->lock, flags); - - ier = siu_read(port, UART_IER); - ier |= UART_IER_MSI; - siu_write(port, UART_IER, ier); - - spin_unlock_irqrestore(&port->lock, flags); -} - -static void siu_break_ctl(struct uart_port *port, int ctl) -{ - unsigned long flags; - uint8_t lcr; - - spin_lock_irqsave(&port->lock, flags); - - lcr = siu_read(port, UART_LCR); - if (ctl == -1) - lcr |= UART_LCR_SBC; - else - lcr &= ~UART_LCR_SBC; - siu_write(port, UART_LCR, lcr); - - spin_unlock_irqrestore(&port->lock, flags); -} - -static inline void receive_chars(struct uart_port *port, uint8_t *status) -{ - uint8_t lsr, ch; - char flag; - int max_count = RX_MAX_COUNT; - - lsr = *status; - - do { - ch = siu_read(port, UART_RX); - port->icount.rx++; - flag = TTY_NORMAL; - -#ifdef CONFIG_SERIAL_VR41XX_CONSOLE - 
lsr |= lsr_break_flag[port->line]; - lsr_break_flag[port->line] = 0; -#endif - if (unlikely(lsr & (UART_LSR_BI | UART_LSR_FE | - UART_LSR_PE | UART_LSR_OE))) { - if (lsr & UART_LSR_BI) { - lsr &= ~(UART_LSR_FE | UART_LSR_PE); - port->icount.brk++; - - if (uart_handle_break(port)) - goto ignore_char; - } - - if (lsr & UART_LSR_FE) - port->icount.frame++; - if (lsr & UART_LSR_PE) - port->icount.parity++; - if (lsr & UART_LSR_OE) - port->icount.overrun++; - - lsr &= port->read_status_mask; - if (lsr & UART_LSR_BI) - flag = TTY_BREAK; - if (lsr & UART_LSR_FE) - flag = TTY_FRAME; - if (lsr & UART_LSR_PE) - flag = TTY_PARITY; - } - - if (uart_handle_sysrq_char(port, ch)) - goto ignore_char; - - uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); - - ignore_char: - lsr = siu_read(port, UART_LSR); - } while ((lsr & UART_LSR_DR) && (max_count-- > 0)); - - tty_flip_buffer_push(&port->state->port); - - *status = lsr; -} - -static inline void check_modem_status(struct uart_port *port) -{ - uint8_t msr; - - msr = siu_read(port, UART_MSR); - if ((msr & UART_MSR_ANY_DELTA) == 0) - return; - if (msr & UART_MSR_DDCD) - uart_handle_dcd_change(port, msr & UART_MSR_DCD); - if (msr & UART_MSR_TERI) - port->icount.rng++; - if (msr & UART_MSR_DDSR) - port->icount.dsr++; - if (msr & UART_MSR_DCTS) - uart_handle_cts_change(port, msr & UART_MSR_CTS); - - wake_up_interruptible(&port->state->port.delta_msr_wait); -} - -static inline void transmit_chars(struct uart_port *port) -{ - struct circ_buf *xmit; - int max_count = TX_MAX_COUNT; - - xmit = &port->state->xmit; - - if (port->x_char) { - siu_write(port, UART_TX, port->x_char); - port->icount.tx++; - port->x_char = 0; - return; - } - - if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { - siu_stop_tx(port); - return; - } - - do { - siu_write(port, UART_TX, xmit->buf[xmit->tail]); - xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); - port->icount.tx++; - if (uart_circ_empty(xmit)) - break; - } while (max_count-- > 0); - - if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) - uart_write_wakeup(port); - - if (uart_circ_empty(xmit)) - siu_stop_tx(port); -} - -static irqreturn_t siu_interrupt(int irq, void *dev_id) -{ - struct uart_port *port; - uint8_t iir, lsr; - - port = (struct uart_port *)dev_id; - - iir = siu_read(port, UART_IIR); - if (iir & UART_IIR_NO_INT) - return IRQ_NONE; - - lsr = siu_read(port, UART_LSR); - if (lsr & UART_LSR_DR) - receive_chars(port, &lsr); - - check_modem_status(port); - - if (lsr & UART_LSR_THRE) - transmit_chars(port); - - return IRQ_HANDLED; -} - -static int siu_startup(struct uart_port *port) -{ - int retval; - - if (port->membase == NULL) - return -ENODEV; - - siu_clear_fifo(port); - - (void)siu_read(port, UART_LSR); - (void)siu_read(port, UART_RX); - (void)siu_read(port, UART_IIR); - (void)siu_read(port, UART_MSR); - - if (siu_read(port, UART_LSR) == 0xff) - return -ENODEV; - - retval = request_irq(port->irq, siu_interrupt, 0, siu_type_name(port), port); - if (retval) - return retval; - - if (port->type == PORT_VR41XX_DSIU) - vr41xx_enable_dsiuint(DSIUINT_ALL); - - siu_write(port, UART_LCR, UART_LCR_WLEN8); - - spin_lock_irq(&port->lock); - siu_set_mctrl(port, port->mctrl); - spin_unlock_irq(&port->lock); - - siu_write(port, UART_IER, UART_IER_RLSI | UART_IER_RDI); - - (void)siu_read(port, UART_LSR); - (void)siu_read(port, UART_RX); - (void)siu_read(port, UART_IIR); - (void)siu_read(port, UART_MSR); - - return 0; -} - -static void siu_shutdown(struct uart_port *port) -{ - unsigned long flags; - uint8_t lcr; - - siu_write(port, 
UART_IER, 0); - - spin_lock_irqsave(&port->lock, flags); - - port->mctrl &= ~TIOCM_OUT2; - siu_set_mctrl(port, port->mctrl); - - spin_unlock_irqrestore(&port->lock, flags); - - lcr = siu_read(port, UART_LCR); - lcr &= ~UART_LCR_SBC; - siu_write(port, UART_LCR, lcr); - - siu_clear_fifo(port); - - (void)siu_read(port, UART_RX); - - if (port->type == PORT_VR41XX_DSIU) - vr41xx_disable_dsiuint(DSIUINT_ALL); - - free_irq(port->irq, port); -} - -static void siu_set_termios(struct uart_port *port, struct ktermios *new, - struct ktermios *old) -{ - tcflag_t c_cflag, c_iflag; - uint8_t lcr, fcr, ier; - unsigned int baud, quot; - unsigned long flags; - - c_cflag = new->c_cflag; - lcr = UART_LCR_WLEN(tty_get_char_size(c_cflag)); - - if (c_cflag & CSTOPB) - lcr |= UART_LCR_STOP; - if (c_cflag & PARENB) - lcr |= UART_LCR_PARITY; - if ((c_cflag & PARODD) != PARODD) - lcr |= UART_LCR_EPAR; - if (c_cflag & CMSPAR) - lcr |= UART_LCR_SPAR; - - baud = uart_get_baud_rate(port, new, old, 0, port->uartclk/16); - quot = uart_get_divisor(port, baud); - - fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10; - - spin_lock_irqsave(&port->lock, flags); - - uart_update_timeout(port, c_cflag, baud); - - c_iflag = new->c_iflag; - - port->read_status_mask = UART_LSR_THRE | UART_LSR_OE | UART_LSR_DR; - if (c_iflag & INPCK) - port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; - if (c_iflag & (IGNBRK | BRKINT | PARMRK)) - port->read_status_mask |= UART_LSR_BI; - - port->ignore_status_mask = 0; - if (c_iflag & IGNPAR) - port->ignore_status_mask |= UART_LSR_FE | UART_LSR_PE; - if (c_iflag & IGNBRK) { - port->ignore_status_mask |= UART_LSR_BI; - if (c_iflag & IGNPAR) - port->ignore_status_mask |= UART_LSR_OE; - } - - if ((c_cflag & CREAD) == 0) - port->ignore_status_mask |= UART_LSR_DR; - - ier = siu_read(port, UART_IER); - ier &= ~UART_IER_MSI; - if (UART_ENABLE_MS(port, c_cflag)) - ier |= UART_IER_MSI; - siu_write(port, UART_IER, ier); - - siu_write(port, UART_LCR, lcr | UART_LCR_DLAB); - - siu_write(port, UART_DLL, (uint8_t)quot); - siu_write(port, UART_DLM, (uint8_t)(quot >> 8)); - - siu_write(port, UART_LCR, lcr); - - siu_write(port, UART_FCR, fcr); - - siu_set_mctrl(port, port->mctrl); - - spin_unlock_irqrestore(&port->lock, flags); -} - -static void siu_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) -{ - switch (state) { - case 0: - switch (port->type) { - case PORT_VR41XX_SIU: - vr41xx_supply_clock(SIU_CLOCK); - break; - case PORT_VR41XX_DSIU: - vr41xx_supply_clock(DSIU_CLOCK); - break; - } - break; - case 3: - switch (port->type) { - case PORT_VR41XX_SIU: - vr41xx_mask_clock(SIU_CLOCK); - break; - case PORT_VR41XX_DSIU: - vr41xx_mask_clock(DSIU_CLOCK); - break; - } - break; - } -} - -static const char *siu_type(struct uart_port *port) -{ - return siu_type_name(port); -} - -static void siu_release_port(struct uart_port *port) -{ - unsigned long size; - - if (port->flags & UPF_IOREMAP) { - iounmap(port->membase); - port->membase = NULL; - } - - size = siu_port_size(port); - release_mem_region(port->mapbase, size); -} - -static int siu_request_port(struct uart_port *port) -{ - unsigned long size; - struct resource *res; - - size = siu_port_size(port); - res = request_mem_region(port->mapbase, size, siu_type_name(port)); - if (res == NULL) - return -EBUSY; - - if (port->flags & UPF_IOREMAP) { - port->membase = ioremap(port->mapbase, size); - if (port->membase == NULL) { - release_resource(res); - return -ENOMEM; - } - } - - return 0; -} - -static void siu_config_port(struct uart_port *port, int flags) 
-{ - if (flags & UART_CONFIG_TYPE) { - port->type = siu_check_type(port); - (void)siu_request_port(port); - } -} - -static int siu_verify_port(struct uart_port *port, struct serial_struct *serial) -{ - if (port->type != PORT_VR41XX_SIU && port->type != PORT_VR41XX_DSIU) - return -EINVAL; - if (port->irq != serial->irq) - return -EINVAL; - if (port->iotype != serial->io_type) - return -EINVAL; - if (port->mapbase != (unsigned long)serial->iomem_base) - return -EINVAL; - - return 0; -} - -static const struct uart_ops siu_uart_ops = { - .tx_empty = siu_tx_empty, - .set_mctrl = siu_set_mctrl, - .get_mctrl = siu_get_mctrl, - .stop_tx = siu_stop_tx, - .start_tx = siu_start_tx, - .stop_rx = siu_stop_rx, - .enable_ms = siu_enable_ms, - .break_ctl = siu_break_ctl, - .startup = siu_startup, - .shutdown = siu_shutdown, - .set_termios = siu_set_termios, - .pm = siu_pm, - .type = siu_type, - .release_port = siu_release_port, - .request_port = siu_request_port, - .config_port = siu_config_port, - .verify_port = siu_verify_port, -}; - -static int siu_init_ports(struct platform_device *pdev) -{ - struct uart_port *port; - struct resource *res; - int *type = dev_get_platdata(&pdev->dev); - int i; - - if (!type) - return 0; - - port = siu_uart_ports; - for (i = 0; i < SIU_PORTS_MAX; i++) { - port->type = type[i]; - if (port->type == PORT_UNKNOWN) - continue; - port->irq = platform_get_irq(pdev, i); - port->uartclk = SIU_BAUD_BASE * 16; - port->fifosize = 16; - port->regshift = 0; - port->iotype = UPIO_MEM; - port->flags = UPF_IOREMAP | UPF_BOOT_AUTOCONF; - port->line = i; - res = platform_get_resource(pdev, IORESOURCE_MEM, i); - port->mapbase = res->start; - port++; - } - - return i; -} - -#ifdef CONFIG_SERIAL_VR41XX_CONSOLE - -static void wait_for_xmitr(struct uart_port *port) -{ - int timeout = 10000; - uint8_t lsr, msr; - - do { - lsr = siu_read(port, UART_LSR); - if (lsr & UART_LSR_BI) - lsr_break_flag[port->line] = UART_LSR_BI; - - if (uart_lsr_tx_empty(lsr)) - break; - } while (timeout-- > 0); - - if (port->flags & UPF_CONS_FLOW) { - timeout = 1000000; - - do { - msr = siu_read(port, UART_MSR); - if ((msr & UART_MSR_CTS) != 0) - break; - } while (timeout-- > 0); - } -} - -static void siu_console_putchar(struct uart_port *port, unsigned char ch) -{ - wait_for_xmitr(port); - siu_write(port, UART_TX, ch); -} - -static void siu_console_write(struct console *con, const char *s, unsigned count) -{ - struct uart_port *port; - uint8_t ier; - - port = &siu_uart_ports[con->index]; - - ier = siu_read(port, UART_IER); - siu_write(port, UART_IER, 0); - - uart_console_write(port, s, count, siu_console_putchar); - - wait_for_xmitr(port); - siu_write(port, UART_IER, ier); -} - -static int __init siu_console_setup(struct console *con, char *options) -{ - struct uart_port *port; - int baud = 9600; - int parity = 'n'; - int bits = 8; - int flow = 'n'; - - if (con->index >= SIU_PORTS_MAX) - con->index = 0; - - port = &siu_uart_ports[con->index]; - if (port->membase == NULL) { - if (port->mapbase == 0) - return -ENODEV; - port->membase = ioremap(port->mapbase, siu_port_size(port)); - } - - if (port->type == PORT_VR41XX_SIU) - vr41xx_select_siu_interface(SIU_INTERFACE_RS232C); - - if (options != NULL) - uart_parse_options(options, &baud, &parity, &bits, &flow); - - return uart_set_options(port, con, baud, parity, bits, flow); -} - -static struct uart_driver siu_uart_driver; - -static struct console siu_console = { - .name = "ttyVR", - .write = siu_console_write, - .device = uart_console_device, - .setup = 
siu_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, - .data = &siu_uart_driver, -}; - -static int siu_console_init(void) -{ - struct uart_port *port; - int i; - - for (i = 0; i < SIU_PORTS_MAX; i++) { - port = &siu_uart_ports[i]; - port->ops = &siu_uart_ops; - } - - register_console(&siu_console); - - return 0; -} - -console_initcall(siu_console_init); - -void __init vr41xx_siu_early_setup(struct uart_port *port) -{ - if (port->type == PORT_UNKNOWN) - return; - - siu_uart_ports[port->line].line = port->line; - siu_uart_ports[port->line].type = port->type; - siu_uart_ports[port->line].uartclk = SIU_BAUD_BASE * 16; - siu_uart_ports[port->line].mapbase = port->mapbase; - siu_uart_ports[port->line].ops = &siu_uart_ops; -} - -#define SERIAL_VR41XX_CONSOLE &siu_console -#else -#define SERIAL_VR41XX_CONSOLE NULL -#endif - -static struct uart_driver siu_uart_driver = { - .owner = THIS_MODULE, - .driver_name = "SIU", - .dev_name = "ttyVR", - .major = SIU_MAJOR, - .minor = SIU_MINOR_BASE, - .cons = SERIAL_VR41XX_CONSOLE, -}; - -static int siu_probe(struct platform_device *dev) -{ - struct uart_port *port; - int num, i, retval; - - num = siu_init_ports(dev); - if (num <= 0) - return -ENODEV; - - siu_uart_driver.nr = num; - retval = uart_register_driver(&siu_uart_driver); - if (retval) - return retval; - - for (i = 0; i < num; i++) { - port = &siu_uart_ports[i]; - port->ops = &siu_uart_ops; - port->dev = &dev->dev; - port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_VR41XX_CONSOLE); - - retval = uart_add_one_port(&siu_uart_driver, port); - if (retval < 0) { - port->dev = NULL; - break; - } - } - - if (i == 0 && retval < 0) { - uart_unregister_driver(&siu_uart_driver); - return retval; - } - - return 0; -} - -static int siu_remove(struct platform_device *dev) -{ - struct uart_port *port; - int i; - - for (i = 0; i < siu_uart_driver.nr; i++) { - port = &siu_uart_ports[i]; - if (port->dev == &dev->dev) { - uart_remove_one_port(&siu_uart_driver, port); - port->dev = NULL; - } - } - - uart_unregister_driver(&siu_uart_driver); - - return 0; -} - -static int siu_suspend(struct platform_device *dev, pm_message_t state) -{ - struct uart_port *port; - int i; - - for (i = 0; i < siu_uart_driver.nr; i++) { - port = &siu_uart_ports[i]; - if ((port->type == PORT_VR41XX_SIU || - port->type == PORT_VR41XX_DSIU) && port->dev == &dev->dev) - uart_suspend_port(&siu_uart_driver, port); - - } - - return 0; -} - -static int siu_resume(struct platform_device *dev) -{ - struct uart_port *port; - int i; - - for (i = 0; i < siu_uart_driver.nr; i++) { - port = &siu_uart_ports[i]; - if ((port->type == PORT_VR41XX_SIU || - port->type == PORT_VR41XX_DSIU) && port->dev == &dev->dev) - uart_resume_port(&siu_uart_driver, port); - } - - return 0; -} - -static struct platform_driver siu_device_driver = { - .probe = siu_probe, - .remove = siu_remove, - .suspend = siu_suspend, - .resume = siu_resume, - .driver = { - .name = "SIU", - }, -}; - -module_platform_driver(siu_device_driver); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:SIU"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 6faf502b7860..3ba34d8378bd 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -124,10 +124,6 @@ /* TXX9 type number */ #define PORT_TXX9 64 -/* NEC VR4100 series SIU/DSIU */ -#define PORT_VR41XX_SIU 65 -#define PORT_VR41XX_DSIU 66 - /* Samsung S3C2400 SoC */ #define PORT_S3C2400 67 -- cgit v1.2.3 From 3c8e19d3d3f9a20cde987fa73fd83b13dcc8604f Mon Sep 17 00:00:00 2001 From: 
Ezequiel Garcia Date: Wed, 6 Jul 2022 19:28:55 +0100 Subject: media: Add P010 tiled format Add P010 tiled format [rebased, updated pixel format name and added description] Tested-by: Benjamin Gaignard Signed-off-by: Ezequiel Garcia Signed-off-by: Jernej Skrabec Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/pixfmt-yuv-planar.rst | 18 +++++++++++++++--- drivers/media/v4l2-core/v4l2-common.c | 1 + drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/uapi/linux/videodev2.h | 1 + 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst index 997ce2d094fc..10b1feeb0b57 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst @@ -116,6 +116,13 @@ All components are stored with the same number of bits per component. - Cb, Cr - Yes - Linear + * - V4L2_PIX_FMT_P010_4L4 + - 'T010' + - 10 + - 4:2:0 + - Cb, Cr + - Yes + - 4x4 tiles * - V4L2_PIX_FMT_NV16 - 'NV16' - 8 @@ -528,11 +535,12 @@ number of lines as the luma plane. - Cr\ :sub:`33` .. _V4L2_PIX_FMT_P010: +.. _V4L2-PIX-FMT-P010-4L4: -P010 ----- +P010 and tiled P010 +------------------- -Like NV12 with 10 bits per component, expanded to 16 bits. +P010 is like NV12 with 10 bits per component, expanded to 16 bits. Data in the 10 high bits, zeros in the 6 low bits, arranged in little endian order. .. flat-table:: Sample 4x4 P010 Image @@ -589,6 +597,10 @@ relationship between the luma and chroma line padding and stride. All components are stored with the same number of bits per component. +``V4L2_PIX_FMT_P010_4L4`` stores pixels in 4x4 tiles, and stores tiles linearly +in memory. The line stride must be aligned to multiple of 8 and image height to +a multiple of 4. The layouts of the luma and chroma planes are identical. + .. 
raw:: latex \small diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 1e38ad8906a2..e0fbe6ba4b6c 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -278,6 +278,7 @@ const struct v4l2_format_info *v4l2_format_info(u32 format) /* Tiled YUV formats */ { .format = V4L2_PIX_FMT_NV12_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 }, + { .format = V4L2_PIX_FMT_P010_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 4, 0, 0 }, .hdiv = 2, .vdiv = 2 }, /* YUV planar formats, non contiguous variant */ { .format = V4L2_PIX_FMT_YUV420M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 2 }, diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index d9b6b8a678bc..c314025d977e 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1357,6 +1357,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_NV12_4L4: descr = "Y/CbCr 4:2:0 (4x4 Linear)"; break; case V4L2_PIX_FMT_NV12_16L16: descr = "Y/CbCr 4:2:0 (16x16 Linear)"; break; case V4L2_PIX_FMT_NV12_32L32: descr = "Y/CbCr 4:2:0 (32x32 Linear)"; break; + case V4L2_PIX_FMT_P010_4L4: descr = "10-bit Y/CbCr 4:2:0 (4x4 Linear)"; break; case V4L2_PIX_FMT_NV12M: descr = "Y/CbCr 4:2:0 (N-C)"; break; case V4L2_PIX_FMT_NV21M: descr = "Y/CrCb 4:2:0 (N-C)"; break; case V4L2_PIX_FMT_NV16M: descr = "Y/CbCr 4:2:2 (N-C)"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index d6fac2344033..01e630f2ec78 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -654,6 +654,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_NV12_4L4 v4l2_fourcc('V', 'T', '1', '2') /* 12 Y/CbCr 4:2:0 4x4 tiles */ #define V4L2_PIX_FMT_NV12_16L16 v4l2_fourcc('H', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 16x16 tiles */ #define V4L2_PIX_FMT_NV12_32L32 v4l2_fourcc('S', 'T', '1', '2') /* 12 Y/CbCr 4:2:0 32x32 tiles */ +#define V4L2_PIX_FMT_P010_4L4 v4l2_fourcc('T', '0', '1', '0') /* 12 Y/CbCr 4:2:0 10-bit 4x4 macroblocks */ /* Tiled YUV formats, non contiguous planes */ #define V4L2_PIX_FMT_NV12MT v4l2_fourcc('T', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 64x32 tiles */ -- cgit v1.2.3 From ddefb2d205539418f3c3851a3e06fac9624f257d Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 14 Jul 2022 17:44:05 +0800 Subject: net/smc: Extend SMC-R link group netlink attribute Extend SMC-R link group netlink attribute SMC_GEN_LGR_SMCR. Introduce SMC_NLA_LGR_R_BUF_TYPE to show the buffer type of SMC-R link group. Signed-off-by: Wen Gu Signed-off-by: David S. 
Miller --- include/uapi/linux/smc.h | 1 + net/smc/smc_core.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 693f549f6966..bb4dacca31e7 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -124,6 +124,7 @@ enum { SMC_NLA_LGR_R_V2, /* nest */ SMC_NLA_LGR_R_NET_COOKIE, /* u64 */ SMC_NLA_LGR_R_PAD, /* flag */ + SMC_NLA_LGR_R_BUF_TYPE, /* u8 */ __SMC_NLA_LGR_R_MAX, SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1 }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f26770c29d78..ff49a11f57b8 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -347,6 +347,8 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) + goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, -- cgit v1.2.3 From d276a22314c2bad9136c5e0b09eb3c8a560e1161 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Jul 2022 08:30:13 +0200 Subject: ublk: remove UBLK_IO_F_INTEGRITY The ublk protocol has no mechanism to actually transfer the integrity metadata, so don't define this flag, which requires that an integrity payload is attached to a bio. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220718063013.335531-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- include/uapi/linux/ublk_cmd.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 42afab25864f..796d8230fb60 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -389,9 +389,6 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_META) flags |= UBLK_IO_F_META; - if (req->cmd_flags & REQ_INTEGRITY) - flags |= UBLK_IO_F_INTEGRITY; - if (req->cmd_flags & REQ_FUA) flags |= UBLK_IO_F_FUA; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a3f5e7c21807..d6879eea2fde 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -106,7 +106,6 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) #define UBLK_IO_F_FAILFAST_DRIVER (1U << 10) #define UBLK_IO_F_META (1U << 11) -#define UBLK_IO_F_INTEGRITY (1U << 12) #define UBLK_IO_F_FUA (1U << 13) #define UBLK_IO_F_PREFLUSH (1U << 14) #define UBLK_IO_F_NOUNMAP (1U << 15) -- cgit v1.2.3 From 450a563924ae9437758bd468c5b7cee9468ce749 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 19 Jul 2022 12:52:29 +0000 Subject: KVM: stats: Fix value for KVM_STATS_UNIT_MAX for boolean stats commit 1b870fa5573e ("kvm: stats: tell userspace which values are boolean") added a new stat unit (boolean) but failed to raise KVM_STATS_UNIT_MAX. Fix by pointing UNIT_MAX at the new max value of UNIT_BOOLEAN. 
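For illustration only (this is not part of the fix): a userspace reader that
validates descriptors against KVM_STATS_UNIT_MAX would otherwise reject the
boolean unit as unknown. A minimal sketch, assuming <stdio.h> plus
<linux/kvm.h>, where "desc" and "value" are a descriptor/datum pair the
caller has already read from a vCPU or VM stats fd:

	/* Illustrative only: classify one stat by its unit field. */
	static void print_stat(const struct kvm_stats_desc *desc, __u64 value)
	{
		__u32 unit = desc->flags & KVM_STATS_UNIT_MASK;

		if (unit > KVM_STATS_UNIT_MAX)	/* boolean trips this before the fix */
			fprintf(stderr, "%s: unknown unit\n", desc->name);
		else if (unit == KVM_STATS_UNIT_BOOLEAN)
			printf("%s: %s\n", desc->name, value ? "true" : "false");
	}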
Fixes: 1b870fa5573e ("kvm: stats: tell userspace which values are boolean") Reported-by: Janis Schoetterl-Glausch Signed-off-by: Oliver Upton Message-Id: <20220719125229.2934273-1-oupton@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 +- include/uapi/linux/kvm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6e090fb96a0e..98a283930307 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5658,7 +5658,7 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) - #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES + #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 811897dadcae..860f867c50c0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2084,7 +2084,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) -#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) -- cgit v1.2.3 From bdb2bc7599298ebb677e40fc92b1fa9e69e05098 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Jul 2022 12:38:00 -0700 Subject: bpf: fix bpf_skb_pull_data documentation Fix documentation for bpf_skb_pull_data() helper for when len == 0. Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") Signed-off-by: Joanne Koong Acked-by: Quentin Monnet Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220715193800.3940070-1-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- tools/include/uapi/linux/bpf.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 379e68fb866f..ffcbf79a556b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2361,7 +2361,8 @@ union bpf_attr { * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. * * This helper is only needed for reading and writing with direct * packet access. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 379e68fb866f..ffcbf79a556b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2361,7 +2361,8 @@ union bpf_attr { * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. 
 *
 * This helper is only needed for reading and writing with direct
 * packet access.
-- cgit v1.2.3

From 9d24322e887b6a3d3f9f9c3e76937a646102c8c1 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron
Date: Tue, 19 Jul 2022 13:52:46 -0700
Subject: PCI/DOE: Add DOE mailbox support functions

Introduced in PCIe r6.0, sec 6.30, DOE provides a config space based
mailbox with standard protocol discovery. Each mailbox is accessed
through a DOE Extended Capability.

Each DOE mailbox must support the DOE discovery protocol in addition
to any number of additional protocols.

Define core PCIe functionality to manage a single PCIe DOE mailbox at
a defined config space offset. Functionality includes iteration,
creation, query of supported protocols, and task submission.
Destruction of the mailboxes is device managed.

Cc: "Li, Ming"
Cc: Bjorn Helgaas
Cc: Matthew Wilcox
Acked-by: Bjorn Helgaas
Signed-off-by: Jonathan Cameron
Co-developed-by: Ira Weiny
Signed-off-by: Ira Weiny
Link: https://lore.kernel.org/r/20220719205249.566684-4-ira.weiny@intel.com
Signed-off-by: Dan Williams
---
 .clang-format                 |   1 +
 drivers/pci/Kconfig           |   3 +
 drivers/pci/Makefile          |   1 +
 drivers/pci/doe.c             | 536 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci-doe.h       |  77 ++++++
 include/uapi/linux/pci_regs.h |  29 ++-
 6 files changed, 646 insertions(+), 1 deletion(-)
 create mode 100644 drivers/pci/doe.c
 create mode 100644 include/linux/pci-doe.h

(limited to 'include/uapi/linux')

diff --git a/.clang-format b/.clang-format
index 9b87ea1fc16e..1247d54f9e49 100644
--- a/.clang-format
+++ b/.clang-format
@@ -516,6 +516,7 @@ ForEachMacros:
   - 'of_property_for_each_string'
   - 'of_property_for_each_u32'
   - 'pci_bus_for_each_resource'
+  - 'pci_doe_for_each_off'
   - 'pcl_for_each_chunk'
   - 'pcl_for_each_segment'
   - 'pcm_for_each_format'
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 133c73207782..b2f2e588a817 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -121,6 +121,9 @@ config XEN_PCIDEV_FRONTEND
 config PCI_ATS
 	bool

+config PCI_DOE
+	bool
+
 config PCI_ECAM
 	bool

diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 0da6b1ebc694..2680e4c92f0a 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_PCI_ECAM)		+= ecam.o
 obj-$(CONFIG_PCI_P2PDMA)	+= p2pdma.o
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
+obj-$(CONFIG_PCI_DOE)		+= doe.o

 # Endpoint library must be initialized before its users
 obj-$(CONFIG_PCI_ENDPOINT)	+= endpoint/
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
new file mode 100644
index 000000000000..e402f05068a5
--- /dev/null
+++ b/drivers/pci/doe.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *	Jonathan Cameron
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny
+ */
+
+#define dev_fmt(fmt) "DOE: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define PCI_DOE_PROTOCOL_DISCOVERY 0
+
+/* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */
+#define PCI_DOE_TIMEOUT HZ
+#define PCI_DOE_POLL_INTERVAL	(PCI_DOE_TIMEOUT / 128)
+
+#define PCI_DOE_FLAG_CANCEL	0
+#define PCI_DOE_FLAG_DEAD	1
+
+/**
+ * struct pci_doe_mb - State for a single DOE mailbox
+ *
+ * This state is used to manage a single DOE mailbox capability.
All fields + * should be considered opaque to the consumers and the structure passed into + * the helpers below after being created by devm_pci_doe_create() + * + * @pdev: PCI device this mailbox belongs to + * @cap_offset: Capability offset + * @prots: Array of protocols supported (encoded as long values) + * @wq: Wait queue for work item + * @work_queue: Queue of pci_doe_work items + * @flags: Bit array of PCI_DOE_FLAG_* flags + */ +struct pci_doe_mb { + struct pci_dev *pdev; + u16 cap_offset; + struct xarray prots; + + wait_queue_head_t wq; + struct workqueue_struct *work_queue; + unsigned long flags; +}; + +static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout) +{ + if (wait_event_timeout(doe_mb->wq, + test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags), + timeout)) + return -EIO; + return 0; +} + +static void pci_doe_write_ctrl(struct pci_doe_mb *doe_mb, u32 val) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + + pci_write_config_dword(pdev, offset + PCI_DOE_CTRL, val); +} + +static int pci_doe_abort(struct pci_doe_mb *doe_mb) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + unsigned long timeout_jiffies; + + pci_dbg(pdev, "[%x] Issuing Abort\n", offset); + + timeout_jiffies = jiffies + PCI_DOE_TIMEOUT; + pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_ABORT); + + do { + int rc; + u32 val; + + rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL); + if (rc) + return rc; + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + + /* Abort success! */ + if (!FIELD_GET(PCI_DOE_STATUS_ERROR, val) && + !FIELD_GET(PCI_DOE_STATUS_BUSY, val)) + return 0; + + } while (!time_after(jiffies, timeout_jiffies)); + + /* Abort has timed out and the MB is dead */ + pci_err(pdev, "[%x] ABORT timed out\n", offset); + return -EIO; +} + +static int pci_doe_send_req(struct pci_doe_mb *doe_mb, + struct pci_doe_task *task) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + u32 val; + int i; + + /* + * Check the DOE busy bit is not set. If it is set, this could indicate + * someone other than Linux (e.g. firmware) is using the mailbox. Note + * it is expected that firmware and OS will negotiate access rights via + * an, as yet to be defined, method. 
+ */ + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_BUSY, val)) + return -EBUSY; + + if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) + return -EIO; + + /* Write DOE Header */ + val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) | + FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type); + pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val); + /* Length is 2 DW of header + length of payload in DW */ + pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, + FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, + 2 + task->request_pl_sz / + sizeof(u32))); + for (i = 0; i < task->request_pl_sz / sizeof(u32); i++) + pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, + task->request_pl[i]); + + pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO); + + return 0; +} + +static bool pci_doe_data_obj_ready(struct pci_doe_mb *doe_mb) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + u32 val; + + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) + return true; + return false; +} + +static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *task) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + size_t length, payload_length; + u32 val; + int i; + + /* Read the first dword to get the protocol */ + pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val); + if ((FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val) != task->prot.vid) || + (FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val) != task->prot.type)) { + dev_err_ratelimited(&pdev->dev, "[%x] expected [VID, Protocol] = [%04x, %02x], got [%04x, %02x]\n", + doe_mb->cap_offset, task->prot.vid, task->prot.type, + FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val), + FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val)); + return -EIO; + } + + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + /* Read the second dword to get the length */ + pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val); + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + + length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val); + if (length > SZ_1M || length < 2) + return -EIO; + + /* First 2 dwords have already been read */ + length -= 2; + payload_length = min(length, task->response_pl_sz / sizeof(u32)); + /* Read the rest of the response payload */ + for (i = 0; i < payload_length; i++) { + pci_read_config_dword(pdev, offset + PCI_DOE_READ, + &task->response_pl[i]); + /* Prior to the last ack, ensure Data Object Ready */ + if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb)) + return -EIO; + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + } + + /* Flush excess length */ + for (; i < length; i++) { + pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val); + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + } + + /* Final error check to pick up on any since Data Object Ready */ + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) + return -EIO; + + return min(length, task->response_pl_sz / sizeof(u32)) * sizeof(u32); +} + +static void signal_task_complete(struct pci_doe_task *task, int rv) +{ + task->rv = rv; + task->complete(task); +} + +static void signal_task_abort(struct pci_doe_task *task, int rv) +{ + struct pci_doe_mb *doe_mb = task->doe_mb; + struct pci_dev *pdev = doe_mb->pdev; + + if (pci_doe_abort(doe_mb)) { + /* + * If the device can't process an abort; set the mailbox 
dead + * - no more submissions + */ + pci_err(pdev, "[%x] Abort failed marking mailbox dead\n", + doe_mb->cap_offset); + set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags); + } + signal_task_complete(task, rv); +} + +static void doe_statemachine_work(struct work_struct *work) +{ + struct pci_doe_task *task = container_of(work, struct pci_doe_task, + work); + struct pci_doe_mb *doe_mb = task->doe_mb; + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + unsigned long timeout_jiffies; + u32 val; + int rc; + + if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) { + signal_task_complete(task, -EIO); + return; + } + + /* Send request */ + rc = pci_doe_send_req(doe_mb, task); + if (rc) { + /* + * The specification does not provide any guidance on how to + * resolve conflicting requests from other entities. + * Furthermore, it is likely that busy will not be detected + * most of the time. Flag any detection of status busy with an + * error. + */ + if (rc == -EBUSY) + dev_err_ratelimited(&pdev->dev, "[%x] busy detected; another entity is sending conflicting requests\n", + offset); + signal_task_abort(task, rc); + return; + } + + timeout_jiffies = jiffies + PCI_DOE_TIMEOUT; + /* Poll for response */ +retry_resp: + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) { + signal_task_abort(task, -EIO); + return; + } + + if (!FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) { + if (time_after(jiffies, timeout_jiffies)) { + signal_task_abort(task, -EIO); + return; + } + rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL); + if (rc) { + signal_task_abort(task, rc); + return; + } + goto retry_resp; + } + + rc = pci_doe_recv_resp(doe_mb, task); + if (rc < 0) { + signal_task_abort(task, rc); + return; + } + + signal_task_complete(task, rc); +} + +static void pci_doe_task_complete(struct pci_doe_task *task) +{ + complete(task->private); +} + +static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid, + u8 *protocol) +{ + u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX, + *index); + u32 response_pl; + DECLARE_COMPLETION_ONSTACK(c); + struct pci_doe_task task = { + .prot.vid = PCI_VENDOR_ID_PCI_SIG, + .prot.type = PCI_DOE_PROTOCOL_DISCOVERY, + .request_pl = &request_pl, + .request_pl_sz = sizeof(request_pl), + .response_pl = &response_pl, + .response_pl_sz = sizeof(response_pl), + .complete = pci_doe_task_complete, + .private = &c, + }; + int rc; + + rc = pci_doe_submit_task(doe_mb, &task); + if (rc < 0) + return rc; + + wait_for_completion(&c); + + if (task.rv != sizeof(response_pl)) + return -EIO; + + *vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl); + *protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL, + response_pl); + *index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX, + response_pl); + + return 0; +} + +static void *pci_doe_xa_prot_entry(u16 vid, u8 prot) +{ + return xa_mk_value((vid << 8) | prot); +} + +static int pci_doe_cache_protocols(struct pci_doe_mb *doe_mb) +{ + u8 index = 0; + u8 xa_idx = 0; + + do { + int rc; + u16 vid; + u8 prot; + + rc = pci_doe_discovery(doe_mb, &index, &vid, &prot); + if (rc) + return rc; + + pci_dbg(doe_mb->pdev, + "[%x] Found protocol %d vid: %x prot: %x\n", + doe_mb->cap_offset, xa_idx, vid, prot); + + rc = xa_insert(&doe_mb->prots, xa_idx++, + pci_doe_xa_prot_entry(vid, prot), GFP_KERNEL); + if (rc) + return rc; + } while (index); + + return 0; +} + +static void pci_doe_xa_destroy(void *mb) +{ + struct pci_doe_mb *doe_mb = mb; + + 
xa_destroy(&doe_mb->prots); +} + +static void pci_doe_destroy_workqueue(void *mb) +{ + struct pci_doe_mb *doe_mb = mb; + + destroy_workqueue(doe_mb->work_queue); +} + +static void pci_doe_flush_mb(void *mb) +{ + struct pci_doe_mb *doe_mb = mb; + + /* Stop all pending work items from starting */ + set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags); + + /* Cancel an in progress work item, if necessary */ + set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags); + wake_up(&doe_mb->wq); + + /* Flush all work items */ + flush_workqueue(doe_mb->work_queue); +} + +/** + * pcim_doe_create_mb() - Create a DOE mailbox object + * + * @pdev: PCI device to create the DOE mailbox for + * @cap_offset: Offset of the DOE mailbox + * + * Create a single mailbox object to manage the mailbox protocol at the + * cap_offset specified. + * + * RETURNS: created mailbox object on success + * ERR_PTR(-errno) on failure + */ +struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset) +{ + struct pci_doe_mb *doe_mb; + struct device *dev = &pdev->dev; + int rc; + + doe_mb = devm_kzalloc(dev, sizeof(*doe_mb), GFP_KERNEL); + if (!doe_mb) + return ERR_PTR(-ENOMEM); + + doe_mb->pdev = pdev; + doe_mb->cap_offset = cap_offset; + init_waitqueue_head(&doe_mb->wq); + + xa_init(&doe_mb->prots); + rc = devm_add_action(dev, pci_doe_xa_destroy, doe_mb); + if (rc) + return ERR_PTR(rc); + + doe_mb->work_queue = alloc_ordered_workqueue("%s %s DOE [%x]", 0, + dev_driver_string(&pdev->dev), + pci_name(pdev), + doe_mb->cap_offset); + if (!doe_mb->work_queue) { + pci_err(pdev, "[%x] failed to allocate work queue\n", + doe_mb->cap_offset); + return ERR_PTR(-ENOMEM); + } + rc = devm_add_action_or_reset(dev, pci_doe_destroy_workqueue, doe_mb); + if (rc) + return ERR_PTR(rc); + + /* Reset the mailbox by issuing an abort */ + rc = pci_doe_abort(doe_mb); + if (rc) { + pci_err(pdev, "[%x] failed to reset mailbox with abort command : %d\n", + doe_mb->cap_offset, rc); + return ERR_PTR(rc); + } + + /* + * The state machine and the mailbox should be in sync now; + * Set up mailbox flush prior to using the mailbox to query protocols. + */ + rc = devm_add_action_or_reset(dev, pci_doe_flush_mb, doe_mb); + if (rc) + return ERR_PTR(rc); + + rc = pci_doe_cache_protocols(doe_mb); + if (rc) { + pci_err(pdev, "[%x] failed to cache protocols : %d\n", + doe_mb->cap_offset, rc); + return ERR_PTR(rc); + } + + return doe_mb; +} +EXPORT_SYMBOL_GPL(pcim_doe_create_mb); + +/** + * pci_doe_supports_prot() - Return if the DOE instance supports the given + * protocol + * @doe_mb: DOE mailbox capability to query + * @vid: Protocol Vendor ID + * @type: Protocol type + * + * RETURNS: True if the DOE mailbox supports the protocol specified + */ +bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type) +{ + unsigned long index; + void *entry; + + /* The discovery protocol must always be supported */ + if (vid == PCI_VENDOR_ID_PCI_SIG && type == PCI_DOE_PROTOCOL_DISCOVERY) + return true; + + xa_for_each(&doe_mb->prots, index, entry) + if (entry == pci_doe_xa_prot_entry(vid, type)) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(pci_doe_supports_prot); + +/** + * pci_doe_submit_task() - Submit a task to be processed by the state machine + * + * @doe_mb: DOE mailbox capability to submit to + * @task: task to be queued + * + * Submit a DOE task (request/response) to the DOE mailbox to be processed. + * Returns upon queueing the task object. If the queue is full this function + * will sleep until there is room in the queue. 
+ * + * task->complete will be called when the state machine is done processing this + * task. + * + * Excess data will be discarded. + * + * RETURNS: 0 when task has been successfully queued, -ERRNO on error + */ +int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task) +{ + if (!pci_doe_supports_prot(doe_mb, task->prot.vid, task->prot.type)) + return -EINVAL; + + /* + * DOE requests must be a whole number of DW and the response needs to + * be big enough for at least 1 DW + */ + if (task->request_pl_sz % sizeof(u32) || + task->response_pl_sz < sizeof(u32)) + return -EINVAL; + + if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) + return -EIO; + + task->doe_mb = doe_mb; + INIT_WORK(&task->work, doe_statemachine_work); + queue_work(doe_mb->work_queue, &task->work); + return 0; +} +EXPORT_SYMBOL_GPL(pci_doe_submit_task); diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h new file mode 100644 index 000000000000..ed9b4df792b8 --- /dev/null +++ b/include/linux/pci-doe.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Object Exchange + * PCIe r6.0, sec 6.30 DOE + * + * Copyright (C) 2021 Huawei + * Jonathan Cameron + * + * Copyright (C) 2022 Intel Corporation + * Ira Weiny + */ + +#ifndef LINUX_PCI_DOE_H +#define LINUX_PCI_DOE_H + +struct pci_doe_protocol { + u16 vid; + u8 type; +}; + +struct pci_doe_mb; + +/** + * struct pci_doe_task - represents a single query/response + * + * @prot: DOE Protocol + * @request_pl: The request payload + * @request_pl_sz: Size of the request payload (bytes) + * @response_pl: The response payload + * @response_pl_sz: Size of the response payload (bytes) + * @rv: Return value. Length of received response or error (bytes) + * @complete: Called when task is complete + * @private: Private data for the consumer + * @work: Used internally by the mailbox + * @doe_mb: Used internally by the mailbox + * + * The payload sizes and rv are specified in bytes with the following + * restrictions concerning the protocol. + * + * 1) The request_pl_sz must be a multiple of double words (4 bytes) + * 2) The response_pl_sz must be >= a single double word (4 bytes) + * 3) rv is returned as bytes but it will be a multiple of double words + * + * NOTE there is no need for the caller to initialize work or doe_mb. 
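+ *
+ * An illustrative sketch of a synchronous caller (modeled on
+ * pci_doe_discovery() in drivers/pci/doe.c; "vid", "type", "req", "rsp"
+ * and the callback "done" are placeholders supplied by the caller):
+ *
+ *	DECLARE_COMPLETION_ONSTACK(c);
+ *	struct pci_doe_task task = {
+ *		.prot.vid = vid,
+ *		.prot.type = type,
+ *		.request_pl = &req,
+ *		.request_pl_sz = sizeof(req),
+ *		.response_pl = &rsp,
+ *		.response_pl_sz = sizeof(rsp),
+ *		.complete = done,	/* e.g. done() calls complete(task->private) */
+ *		.private = &c,
+ *	};
+ *
+ *	if (!pci_doe_submit_task(doe_mb, &task))
+ *		wait_for_completion(&c);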
+ */ +struct pci_doe_task { + struct pci_doe_protocol prot; + u32 *request_pl; + size_t request_pl_sz; + u32 *response_pl; + size_t response_pl_sz; + int rv; + void (*complete)(struct pci_doe_task *task); + void *private; + + /* No need for the user to initialize these fields */ + struct work_struct work; + struct pci_doe_mb *doe_mb; +}; + +/** + * pci_doe_for_each_off - Iterate each DOE capability + * @pdev: struct pci_dev to iterate + * @off: u16 of config space offset of each mailbox capability found + */ +#define pci_doe_for_each_off(pdev, off) \ + for (off = pci_find_next_ext_capability(pdev, off, \ + PCI_EXT_CAP_ID_DOE); \ + off > 0; \ + off = pci_find_next_ext_capability(pdev, off, \ + PCI_EXT_CAP_ID_DOE)) + +struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset); +bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type); +int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task); + +#endif diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 108f8523fa04..57b8e2ffb1dd 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -737,7 +737,8 @@ #define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT +#define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_DOE #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 @@ -1103,4 +1104,30 @@ #define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK 0x000000F0 #define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT 4 +/* Data Object Exchange */ +#define PCI_DOE_CAP 0x04 /* DOE Capabilities Register */ +#define PCI_DOE_CAP_INT_SUP 0x00000001 /* Interrupt Support */ +#define PCI_DOE_CAP_INT_MSG_NUM 0x00000ffe /* Interrupt Message Number */ +#define PCI_DOE_CTRL 0x08 /* DOE Control Register */ +#define PCI_DOE_CTRL_ABORT 0x00000001 /* DOE Abort */ +#define PCI_DOE_CTRL_INT_EN 0x00000002 /* DOE Interrupt Enable */ +#define PCI_DOE_CTRL_GO 0x80000000 /* DOE Go */ +#define PCI_DOE_STATUS 0x0c /* DOE Status Register */ +#define PCI_DOE_STATUS_BUSY 0x00000001 /* DOE Busy */ +#define PCI_DOE_STATUS_INT_STATUS 0x00000002 /* DOE Interrupt Status */ +#define PCI_DOE_STATUS_ERROR 0x00000004 /* DOE Error */ +#define PCI_DOE_STATUS_DATA_OBJECT_READY 0x80000000 /* Data Object Ready */ +#define PCI_DOE_WRITE 0x10 /* DOE Write Data Mailbox Register */ +#define PCI_DOE_READ 0x14 /* DOE Read Data Mailbox Register */ + +/* DOE Data Object - note not actually registers */ +#define PCI_DOE_DATA_OBJECT_HEADER_1_VID 0x0000ffff +#define PCI_DOE_DATA_OBJECT_HEADER_1_TYPE 0x00ff0000 +#define PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH 0x0003ffff + +#define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX 0x000000ff +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID 0x0000ffff +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL 0x00ff0000 +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX 0xff000000 + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From e70a3263a7eed768d5f947b8f2aff8d2a79c9d97 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 19 Jul 2022 23:35:48 +0900 Subject: can: error: specify the values of data[5..7] of CAN error frames Currently, data[5..7] of struct can_frame, when used as a CAN error frame, are defined as being "controller specific". 
Device specific behaviours are problematic because they prevent someone
from writing code which is portable between devices. As a matter of fact,
data[5] is never used, data[6] is always used to report the TX error
counter and data[7] is always used to report the RX error counter.
can-utils also relies on this.

This patch updates the comment in the uapi header to specify that data[5]
is reserved (and thus should not be used) and that data[6..7] are used
for the error counters.

Fixes: 0d66548a10cb ("[CAN]: Add PF_CAN core module")
Link: https://lore.kernel.org/all/20220719143550.3681-11-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol
Signed-off-by: Marc Kleine-Budde
---
 include/uapi/linux/can/error.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index 34633283de64..a1000cb63063 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -120,6 +120,9 @@
 #define CAN_ERR_TRX_CANL_SHORT_TO_GND	0x70 /* 0111 0000 */
 #define CAN_ERR_TRX_CANL_SHORT_TO_CANH	0x80 /* 1000 0000 */

-/* controller specific additional information / data[5..7] */
+/* data[5] is reserved (do not use) */
+
+/* TX error counter / data[6] */
+/* RX error counter / data[7] */

 #endif /* _UAPI_CAN_ERROR_H */
-- cgit v1.2.3

From 3e5c291c7942d0909a48bc5ec1b9bba136465166 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol
Date: Tue, 19 Jul 2022 23:35:49 +0900
Subject: can: add CAN_ERR_CNT flag to notify availability of error counter

Add a dedicated flag in uapi/linux/can/error.h to notify the userland
that fields data[6] and data[7] of the CAN error frame are respectively
populated with the tx and rx error counters.

For all drivers tree-wide, set this flag whenever needed.
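As an illustration (not part of this patch), a userspace consumer can now
test the flag instead of hard-coding per-device knowledge. A minimal
sketch, assuming <stdio.h>, <unistd.h>, <linux/can.h> and
<linux/can/error.h>, where "s" is a bound CAN_RAW socket whose
CAN_RAW_ERR_FILTER mask passes error frames:

	struct can_frame cf;

	if (read(s, &cf, sizeof(cf)) == sizeof(cf) &&
	    (cf.can_id & CAN_ERR_FLAG) && (cf.can_id & CAN_ERR_CNT))
		printf("txerr=%u rxerr=%u\n", cf.data[6], cf.data[7]);

Error frames without CAN_ERR_CNT set simply do not report the counters,
which is exactly the distinction the new flag makes visible.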
Link: https://lore.kernel.org/all/20220719143550.3681-12-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/c_can/c_can_main.c | 6 +++--- drivers/net/can/cc770/cc770.c | 1 + drivers/net/can/ctucanfd/ctucanfd_base.c | 5 +++-- drivers/net/can/grcan.c | 1 + drivers/net/can/ifi_canfd/ifi_canfd.c | 4 ++-- drivers/net/can/janz-ican3.c | 4 ++-- drivers/net/can/kvaser_pciefd.c | 2 +- drivers/net/can/m_can/m_can.c | 4 ++-- drivers/net/can/pch_can.c | 1 + drivers/net/can/peak_canfd/peak_canfd.c | 6 +++--- drivers/net/can/rcar/rcar_can.c | 1 + drivers/net/can/rcar/rcar_canfd.c | 4 ++-- drivers/net/can/sja1000/sja1000.c | 1 + drivers/net/can/slcan/slcan-core.c | 1 + drivers/net/can/spi/hi311x.c | 1 + drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 1 + drivers/net/can/sun4i_can.c | 1 + drivers/net/can/ti_hecc.c | 1 + drivers/net/can/usb/esd_usb.c | 3 ++- drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 2 ++ drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c | 1 + drivers/net/can/usb/peak_usb/pcan_usb.c | 1 + drivers/net/can/usb/usb_8dev.c | 1 + drivers/net/can/xilinx_can.c | 1 + include/uapi/linux/can/error.h | 2 ++ 25 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/can/c_can/c_can_main.c b/drivers/net/can/c_can/c_can_main.c index ed4db4cf8716..de38d8f7b5f7 100644 --- a/drivers/net/can/c_can/c_can_main.c +++ b/drivers/net/can/c_can/c_can_main.c @@ -952,14 +952,14 @@ static int c_can_handle_state_change(struct net_device *dev, switch (error_type) { case C_CAN_NO_ERROR: - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = CAN_ERR_CRTL_ACTIVE; cf->data[6] = bec.txerr; cf->data[7] = bec.rxerr; break; case C_CAN_ERROR_WARNING: /* error warning state */ - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = (bec.txerr > bec.rxerr) ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; @@ -969,7 +969,7 @@ static int c_can_handle_state_change(struct net_device *dev, break; case C_CAN_ERROR_PASSIVE: /* error passive state */ - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; if (rx_err_passive) cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE; if (bec.txerr > 127) diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c index bb7224cfc6ab..797a954bb1a0 100644 --- a/drivers/net/can/cc770/cc770.c +++ b/drivers/net/can/cc770/cc770.c @@ -512,6 +512,7 @@ static int cc770_err(struct net_device *dev, u8 status) /* Use extended functions of the CC770 */ if (priv->control_normal_mode & CTRL_EAF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = cc770_read_reg(priv, tx_error_counter); cf->data[7] = cc770_read_reg(priv, rx_error_counter); } diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c index 14ac7c0ee04c..6b281f6eb9b4 100644 --- a/drivers/net/can/ctucanfd/ctucanfd_base.c +++ b/drivers/net/can/ctucanfd/ctucanfd_base.c @@ -847,7 +847,7 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr) case CAN_STATE_ERROR_PASSIVE: priv->can.can_stats.error_passive++; if (skb) { - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = (bec.rxerr > 127) ? 
CAN_ERR_CRTL_RX_PASSIVE : CAN_ERR_CRTL_TX_PASSIVE; @@ -858,7 +858,7 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr) case CAN_STATE_ERROR_WARNING: priv->can.can_stats.error_warning++; if (skb) { - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] |= (bec.txerr > bec.rxerr) ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; @@ -867,6 +867,7 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr) } break; case CAN_STATE_ERROR_ACTIVE: + cf->can_id |= CAN_ERR_CNT; cf->data[1] = CAN_ERR_CRTL_ACTIVE; cf->data[6] = bec.txerr; cf->data[7] = bec.rxerr; diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index 4c47c1055eff..24035a6187c9 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -671,6 +671,7 @@ static void grcan_err(struct net_device *dev, u32 sources, u32 status) /* There are no others at this point */ break; } + cf.can_id |= CAN_ERR_CNT; cf.data[6] = txerr; cf.data[7] = rxerr; priv->can.state = state; diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index 968ed6d7316b..64e3be8b73af 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -492,7 +492,7 @@ static int ifi_canfd_handle_state_change(struct net_device *ndev, switch (new_state) { case CAN_STATE_ERROR_WARNING: /* error warning state */ - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = (bec.txerr > bec.rxerr) ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; @@ -501,7 +501,7 @@ static int ifi_canfd_handle_state_change(struct net_device *ndev, break; case CAN_STATE_ERROR_PASSIVE: /* error passive state */ - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE; if (bec.txerr > 127) cf->data[1] |= CAN_ERR_CRTL_TX_PASSIVE; diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index 35bfb82d6929..ccb5c5405224 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -1127,7 +1127,7 @@ static int ican3_handle_cevtind(struct ican3_dev *mod, struct ican3_msg *msg) /* bus error interrupt */ if (isrc == CEVTIND_BEI) { mod->can.can_stats.bus_error++; - cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; + cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR | CAN_ERR_CNT; switch (ecc & ECC_MASK) { case ECC_BIT: @@ -1153,7 +1153,7 @@ static int ican3_handle_cevtind(struct ican3_dev *mod, struct ican3_msg *msg) if (state != mod->can.state && (state == CAN_STATE_ERROR_WARNING || state == CAN_STATE_ERROR_PASSIVE)) { - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; if (state == CAN_STATE_ERROR_WARNING) { mod->can.can_stats.error_warning++; cf->data[1] = (txerr > rxerr) ? 
diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index 017f2d36ffc3..dcd2c9d50d5e 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -1306,7 +1306,7 @@ static int kvaser_pciefd_rx_error_frame(struct kvaser_pciefd_can *can, shhwtstamps->hwtstamp = ns_to_ktime(div_u64(p->timestamp * 1000, can->kv_pcie->freq_to_ticks_div)); - cf->can_id |= CAN_ERR_BUSERROR; + cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_CNT; cf->data[6] = bec.txerr; cf->data[7] = bec.rxerr; diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index afaaeb610c00..713a4b0edf86 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -741,7 +741,7 @@ static int m_can_handle_state_change(struct net_device *dev, switch (new_state) { case CAN_STATE_ERROR_WARNING: /* error warning state */ - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = (bec.txerr > bec.rxerr) ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; @@ -750,7 +750,7 @@ static int m_can_handle_state_change(struct net_device *dev, break; case CAN_STATE_ERROR_PASSIVE: /* error passive state */ - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; ecr = m_can_read(cdev, M_CAN_ECR); if (ecr & ECR_RP) cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE; diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c index 497ef77340ea..50f6719b3aa4 100644 --- a/drivers/net/can/pch_can.c +++ b/drivers/net/can/pch_can.c @@ -497,6 +497,7 @@ static void pch_can_error(struct net_device *ndev, u32 status) priv->can.can_stats.bus_off++; can_bus_off(ndev); } else { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = errc & PCH_TEC; cf->data[7] = (errc & PCH_REC) >> 8; } diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index b2dea360813d..afb9adb3d5c2 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -373,7 +373,7 @@ static int pucan_handle_status(struct peak_canfd_priv *priv, priv->can.state = CAN_STATE_ERROR_PASSIVE; priv->can.can_stats.error_passive++; if (skb) { - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = (priv->bec.txerr > priv->bec.rxerr) ? CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE; @@ -386,7 +386,7 @@ static int pucan_handle_status(struct peak_canfd_priv *priv, priv->can.state = CAN_STATE_ERROR_WARNING; priv->can.can_stats.error_warning++; if (skb) { - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = (priv->bec.txerr > priv->bec.rxerr) ? 
CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; @@ -430,7 +430,7 @@ static int pucan_handle_cache_critical(struct peak_canfd_priv *priv) return -ENOMEM; } - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; cf->data[6] = priv->bec.txerr; diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 24d7a71def6a..d11db2112a4a 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -334,6 +334,7 @@ static void rcar_can_error(struct net_device *ndev) if (skb) cf->can_id |= CAN_ERR_BUSOFF; } else if (skb) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index ba42cef10a53..e3382284e172 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1052,7 +1052,7 @@ static void rcar_canfd_error(struct net_device *ndev, u32 cerfl, netdev_dbg(ndev, "Error warning interrupt\n"); priv->can.state = CAN_STATE_ERROR_WARNING; priv->can.can_stats.error_warning++; - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = txerr > rxerr ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; cf->data[6] = txerr; @@ -1062,7 +1062,7 @@ static void rcar_canfd_error(struct net_device *ndev, u32 cerfl, netdev_dbg(ndev, "Error passive interrupt\n"); priv->can.state = CAN_STATE_ERROR_PASSIVE; priv->can.can_stats.error_passive++; - cf->can_id |= CAN_ERR_CRTL; + cf->can_id |= CAN_ERR_CRTL | CAN_ERR_CNT; cf->data[1] = txerr > rxerr ? CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE; cf->data[6] = txerr; diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 74bff5092b47..75a2f9bf8c16 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -426,6 +426,7 @@ static int sja1000_err(struct net_device *dev, uint8_t isrc, uint8_t status) state = CAN_STATE_ERROR_ACTIVE; } if (state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c index dfd1baba4130..dc28e715bbe1 100644 --- a/drivers/net/can/slcan/slcan-core.c +++ b/drivers/net/can/slcan/slcan-core.c @@ -314,6 +314,7 @@ static void slc_bump_state(struct slcan *sl) if (state == CAN_STATE_BUS_OFF) { can_bus_off(dev); } else if (skb) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index bfb7c4bb5bc3..167114aae6dd 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -680,6 +680,7 @@ static irqreturn_t hi3110_can_ist(int irq, void *dev_id) break; } } else { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 9b47b07162fe..f4e174cadd4e 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1099,6 +1099,7 @@ static int mcp251xfd_handle_cerrif(struct mcp251xfd_priv *priv) err = mcp251xfd_get_berr_counter(priv->ndev, &bec); if (err) return err; + cf->can_id |= CAN_ERR_CNT; cf->data[6] = bec.txerr; cf->data[7] = bec.rxerr; } diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index afe9b541f037..b90dfb429ccd 100644 --- a/drivers/net/can/sun4i_can.c +++ 
b/drivers/net/can/sun4i_can.c @@ -566,6 +566,7 @@ static int sun4i_can_err(struct net_device *dev, u8 isrc, u8 status) state = CAN_STATE_ERROR_ACTIVE; } if (skb && state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index debe17bfd0f0..afa38771520e 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -662,6 +662,7 @@ static void ti_hecc_change_state(struct net_device *ndev, can_change_state(priv->ndev, cf, tx_state, rx_state); if (max(tx_state, rx_state) != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = hecc_read(priv, HECC_CANTEC); cf->data[7] = hecc_read(priv, HECC_CANREC); } diff --git a/drivers/net/can/usb/esd_usb.c b/drivers/net/can/usb/esd_usb.c index 8a4bf2961f3d..177ed33e08d9 100644 --- a/drivers/net/can/usb/esd_usb.c +++ b/drivers/net/can/usb/esd_usb.c @@ -265,7 +265,8 @@ static void esd_usb_rx_event(struct esd_usb_net_priv *priv, priv->can.can_stats.bus_error++; stats->rx_errors++; - cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; + cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR | + CAN_ERR_CNT; switch (ecc & SJA1000_ECC_MASK) { case SJA1000_ECC_BIT: diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index 404093468b2f..dd65c101bfb8 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -918,6 +918,7 @@ static void kvaser_usb_hydra_update_state(struct kvaser_usb_net_priv *priv, priv->can.can_stats.restarts++; if (new_state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = bec->txerr; cf->data[7] = bec->rxerr; } @@ -1072,6 +1073,7 @@ kvaser_usb_hydra_error_frame(struct kvaser_usb_net_priv *priv, cf->can_id |= CAN_ERR_BUSERROR; if (new_state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = bec.txerr; cf->data[7] = bec.rxerr; } diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index f551fde16a70..07f687f29b34 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -854,6 +854,7 @@ static void kvaser_usb_leaf_rx_error(const struct kvaser_usb *dev, } if (new_state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = es->txerr; cf->data[7] = es->rxerr; } diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 091c631ebe23..d07b7ee79e3e 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -506,6 +506,7 @@ static int pcan_usb_decode_error(struct pcan_usb_msg_context *mc, u8 n, /* Supply TX/RX error counters in case of * controller error. 
*/ + cf->can_id = CAN_ERR_CNT; cf->data[6] = mc->pdev->bec.txerr; cf->data[7] = mc->pdev->bec.rxerr; } diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index 4d38dc90472a..8b7cd69e20b0 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -439,6 +439,7 @@ static void usb_8dev_rx_err_msg(struct usb_8dev_priv *priv, if (rx_errors) stats->rx_errors++; if (priv->can.state != CAN_STATE_BUS_OFF) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 0de2f97d9f62..caa6b4cee63f 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -965,6 +965,7 @@ static void xcan_set_error_state(struct net_device *ndev, can_change_state(ndev, cf, tx_state, rx_state); if (cf) { + cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h index a1000cb63063..b7c3efd9ff99 100644 --- a/include/uapi/linux/can/error.h +++ b/include/uapi/linux/can/error.h @@ -57,6 +57,8 @@ #define CAN_ERR_BUSOFF 0x00000040U /* bus off */ #define CAN_ERR_BUSERROR 0x00000080U /* bus error (may flood!) */ #define CAN_ERR_RESTARTED 0x00000100U /* controller restarted */ +#define CAN_ERR_CNT 0x00000200U /* TX error counter / data[6] */ + /* RX error counter / data[7] */ /* arbitration lost in bit ... / data[0] */ #define CAN_ERR_LOSTARB_UNSPEC 0x00 /* unspecified */ -- cgit v1.2.3 From 3f9c26210cf80ea8cb5dd901aba5feb77200b085 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 19 Jul 2022 23:35:50 +0900 Subject: can: error: add definitions for the different CAN error thresholds Currently, drivers are using magic numbers to derive the CAN error states from the error counter. Add three macro declarations to remediate this. For reference, the error-active, error-passive and bus-off are defined in ISO 11898, section 12.1.4.2 "Error counting". Although ISO 11898 does not define error-warning state, this extra value is also commonly used and is thus also added. Link: https://lore.kernel.org/all/20220719143550.3681-13-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/error.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h index b7c3efd9ff99..acc1ac393d2a 100644 --- a/include/uapi/linux/can/error.h +++ b/include/uapi/linux/can/error.h @@ -127,4 +127,17 @@ /* TX error counter / data[6] */ /* RX error counter / data[7] */ +/* CAN state thresholds + * + * Error counter Error state + * ----------------------------------- + * 0 - 95 Error-active + * 96 - 127 Error-warning + * 128 - 255 Error-passive + * 256 and greater Bus-off + */ +#define CAN_ERROR_WARNING_THRESHOLD 96 +#define CAN_ERROR_PASSIVE_THRESHOLD 128 +#define CAN_BUS_OFF_THRESHOLD 256 + #endif /* _UAPI_CAN_ERROR_H */ -- cgit v1.2.3 From f5ecfee94493475783074e86ded10a0499d779fc Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 14 Jul 2022 21:43:34 +0200 Subject: KVM: s390: resetting the Topology-Change-Report During a subsystem reset the Topology-Change-Report is cleared. Let's give userland the possibility to clear the MTCR in the case of a subsystem reset. To migrate the MTCR, we give userland the possibility to query the MTCR state. 
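As an illustration (not from the patch), querying the MTCR through the new attribute group could look like this sketch; vm_fd setup and error handling are assumed:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

uint8_t mtcr;
struct kvm_device_attr attr = {
	.group = KVM_S390_VM_CPU_TOPOLOGY,	/* attribute group added below */
	.addr = (uint64_t)(uintptr_t)&mtcr,	/* byte receiving the MTCR state */
};

/* setting/clearing instead uses KVM_SET_DEVICE_ATTR with attr.attr as the value */
int ret = ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);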
We indicate KVM support for the CPU topology facility with a new KVM capability: KVM_CAP_S390_CPU_TOPOLOGY. Signed-off-by: Pierre Morel Reviewed-by: Janis Schoetterl-Glausch Reviewed-by: Janosch Frank Message-Id: <20220714194334.127812-1-pmorel@linux.ibm.com> Link: https://lore.kernel.org/all/20220714194334.127812-1-pmorel@linux.ibm.com/ [frankja@linux.ibm.com: Simple conflict resolution in Documentation/virt/kvm/api.rst] Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 25 ++++++++++++++++++++ arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 51 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 78 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5be5cc59869d..3c4551a2f6d0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8269,6 +8269,31 @@ The capability has no effect if the nx_huge_pages module parameter is not set. This capability may only be set before any vCPUs are created. +8.39 KVM_CAP_S390_CPU_TOPOLOGY +------------------------------ + +:Capability: KVM_CAP_S390_CPU_TOPOLOGY +:Architectures: s390 +:Type: vm + +This capability indicates that KVM will provide the S390 CPU Topology +facility which consist of the interpretation of the PTF instruction for +the function code 2 along with interception and forwarding of both the +PTF instruction with function codes 0 or 1 and the STSI(15,1,x) +instruction to the userland hypervisor. + +The stfle facility 11, CPU Topology facility, should not be indicated +to the guest without this capability. + +When this capability is present, KVM provides a new attribute group +on vm fd, KVM_S390_VM_CPU_TOPOLOGY. +This new attribute allows to get, set or clear the Modified Change +Topology Report (MTCR) bit of the SCA through the kvm_device_attr +structure. + +When getting the Modified Change Topology Report value, the attr->addr +must point to a byte where the value will be stored or retrieved from. + 9. Known KVM API problems ========================= diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7a6b14874d65..a73cf01a1606 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5d18b66a08c9..edfd4bbd0cba 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -642,6 +642,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_ZPCI_OP: r = kvm_s390_pci_interp_allowed(); break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } @@ -853,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? 
"(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -1789,6 +1806,31 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) read_unlock(&kvm->arch.sca_lock); } +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1809,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1834,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1907,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 20817dd7f2f1..7e06194129e3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1168,6 +1168,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_NOTIFY_VMEXIT 219 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 +#define KVM_CAP_S390_CPU_TOPOLOGY 222 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From d0b55afa47694f6f61b40f578ede7bde1648fe48 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 6 Jul 2022 17:20:52 -0700 Subject: dmaengine: idxd: Correct IAX operation code names Some IAX operation code nomenclatures are misleading or don't match with others: 1. Operation code 0x4c is Zero Compress 32. IAX_OPCODE_DECOMP_32 is a misleading name. Change it to IAX_OPCODE_ZERO_COMP_32. 2. Operation code 0x4d is Zero Compress 16. IAX_OPCODE_DECOMP_16 is a misleading name. Change it to IAX_OPCODE_ZERO_COMP_16. 3. IAX_OPCDE_FIND_UNIQUE is corrected to match with other nomenclatures. 
Co-developed-by: Li Zhang Signed-off-by: Li Zhang Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20220707002052.1546361-1-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index bce7c43657d5..095299c75828 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -89,14 +89,14 @@ enum iax_opcode { IAX_OPCODE_CRC64, IAX_OPCODE_ZERO_DECOMP_32 = 0x48, IAX_OPCODE_ZERO_DECOMP_16, - IAX_OPCODE_DECOMP_32 = 0x4c, - IAX_OPCODE_DECOMP_16, + IAX_OPCODE_ZERO_COMP_32 = 0x4c, + IAX_OPCODE_ZERO_COMP_16, IAX_OPCODE_SCAN = 0x50, IAX_OPCODE_SET_MEMBER, IAX_OPCODE_EXTRACT, IAX_OPCODE_SELECT, IAX_OPCODE_RLE_BURST, - IAX_OPCDE_FIND_UNIQUE, + IAX_OPCODE_FIND_UNIQUE, IAX_OPCODE_EXPAND, }; -- cgit v1.2.3 From 5f8bcc837a9640ba4bf5e7b1d7f9b254ea029f47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Jul 2022 15:09:10 +0200 Subject: ublk: remove UBLK_IO_F_PREFLUSH REQ_PREFLUSH is turned into REQ_OP_FLUSH by the flush state machine and thus never seen by a blk-mq based driver. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220721130916.1869719-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- include/uapi/linux/ublk_cmd.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b90481b295a7..07913b5bccd9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -392,9 +392,6 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_FUA) flags |= UBLK_IO_F_FUA; - if (req->cmd_flags & REQ_PREFLUSH) - flags |= UBLK_IO_F_PREFLUSH; - if (req->cmd_flags & REQ_NOUNMAP) flags |= UBLK_IO_F_NOUNMAP; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index d6879eea2fde..917580b34198 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -107,7 +107,6 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_FAILFAST_DRIVER (1U << 10) #define UBLK_IO_F_META (1U << 11) #define UBLK_IO_F_FUA (1U << 13) -#define UBLK_IO_F_PREFLUSH (1U << 14) #define UBLK_IO_F_NOUNMAP (1U << 15) #define UBLK_IO_F_SWAP (1U << 16) -- cgit v1.2.3 From 9dd1953846c7cd58100a5c6bd90db54e2c60668a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Jul 2022 10:26:50 +0200 Subject: wifi: nl80211/mac80211: clarify link ID in control port TX Clarify the link ID behaviour in control port TX, we need it to select the link to transmit on for both MLD and non-MLD receivers, but select the link address as the SA only if the receiver is not an MLD. Fixes: 67207bab9341 ("wifi: cfg80211/mac80211: Support control port TX from specific link") Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 ++++++ net/mac80211/tx.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 3fa586e38f88..d4d6ba585b41 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1119,6 +1119,12 @@ * has been received. %NL80211_ATTR_FRAME is used to specify the * frame contents. The frame is the raw EAPoL data, without ethernet or * 802.11 headers. 
+ * For an MLD transmitter, the %NL80211_ATTR_MLO_LINK_ID may be given and + * its effect will depend on the destination: If the destination is known + * to be an MLD, this will be used as a hint to select the link to transmit + * the frame on. If the destination is not an MLD, this will select both + * the link to transmit on and the source address will be set to the link + * address of that link. * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added * indicating the protocol type of the received frame; whether the frame diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 772108c2cc6b..06ec152e8188 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2896,9 +2896,35 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, info->flags = info_flags; info->ack_frame_id = info_id; info->band = band; - info->control.flags = ctrl_flags | - u32_encode_bits(link_id, + + if (likely(!cookie)) { + ctrl_flags |= u32_encode_bits(link_id, IEEE80211_TX_CTRL_MLO_LINK); + } else { + unsigned int pre_conf_link_id; + + /* + * ctrl_flags already have been set by + * ieee80211_tx_control_port(), here + * we just sanity check that + */ + + pre_conf_link_id = u32_get_bits(ctrl_flags, + IEEE80211_TX_CTRL_MLO_LINK); + + if (pre_conf_link_id != link_id && + link_id != IEEE80211_LINK_UNSPECIFIED) { +#ifdef CPTCFG_MAC80211_VERBOSE_DEBUG + net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. %d)\n", + sdata->name, hdr.addr1, + pre_conf_link_id, link_id); +#endif + ret = -EINVAL; + goto free; + } + } + + info->control.flags = ctrl_flags; return skb; free: @@ -5745,11 +5771,17 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr->h_dest, dest, ETH_ALEN); + /* we may override the SA for MLO STA later */ if (link_id < 0) { + ctrl_flags |= u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, + IEEE80211_TX_CTRL_MLO_LINK); memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } else { struct ieee80211_bss_conf *link_conf; + ctrl_flags |= u32_encode_bits(link_id, + IEEE80211_TX_CTRL_MLO_LINK); + rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (!link_conf) { @@ -5784,6 +5816,13 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_set_queue_mapping(skb, queue); skb_get_hash(skb); + + /* + * for MLO STA, the SA should be the AP MLD address, but + * the link ID has been selected already + */ + if (sta->sta.mlo) + memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } rcu_read_unlock(); -- cgit v1.2.3 From 80b0ed70a271d375feb2286696ca8af147a035cf Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 26 Jan 2022 16:06:35 +0200 Subject: wifi: nl80211: add RX and TX timestamp attributes Add attributes for reporting hardware timestamps for management frames RX and TX. These attributes will be used for reporting hardware timestamps for Timing Measurement and Fine Timing Measurement action frames, which will allow userspace applications to measure the path delay between devices and sync clocks. For RX, these attributes are used for reporting the frame RX time and the ack TX time. For TX, they are used for reporting the frame TX time and the ack RX time.
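A hedged sketch (not from the patch) of consuming the new attributes with libnl-3; the surrounding socket and multicast-group setup is assumed:

#include <netlink/genl/genl.h>
#include <linux/nl80211.h>
#include <stdio.h>

static int frame_event_cb(struct nl_msg *msg, void *arg)
{
	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
	struct nlattr *tb[NL80211_ATTR_MAX + 1];

	nla_parse(tb, NL80211_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
		  genlmsg_attrlen(gnlh, 0), NULL);
	if (tb[NL80211_ATTR_RX_HW_TIMESTAMP])	/* device clock, nanoseconds */
		printf("rx hw timestamp: %llu\n", (unsigned long long)
		       nla_get_u64(tb[NL80211_ATTR_RX_HW_TIMESTAMP]));
	return NL_SKIP;
}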
Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d4d6ba585b41..5275dcbc5ee8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -764,6 +764,9 @@ * %NL80211_ATTR_CSA_C_OFFSETS_TX is an array of offsets to CSA * counters which will be updated to the current value. This attribute * is used during CSA period. + * For RX notification, %NL80211_ATTR_RX_HW_TIMESTAMP may be included to + * indicate the frame RX timestamp and %NL80211_ATTR_TX_HW_TIMESTAMP may + * be included to indicate the ack TX timestamp. * @NL80211_CMD_FRAME_WAIT_CANCEL: When an off-channel TX was requested, this * command may be used with the corresponding cookie to cancel the wait * time if it is known that it is no longer necessary. This command is @@ -774,7 +777,9 @@ * transmitted with %NL80211_CMD_FRAME. %NL80211_ATTR_COOKIE identifies * the TX command and %NL80211_ATTR_FRAME includes the contents of the * frame. %NL80211_ATTR_ACK flag is included if the recipient acknowledged - * the frame. + * the frame. %NL80211_ATTR_TX_HW_TIMESTAMP may be included to indicate the + * tx timestamp and %NL80211_ATTR_RX_HW_TIMESTAMP may be included to + * indicate the ack RX timestamp. * @NL80211_CMD_ACTION_TX_STATUS: Alias for @NL80211_CMD_FRAME_TX_STATUS for * backward compatibility. * @@ -2720,6 +2725,18 @@ enum nl80211_commands { * @NL80211_ATTR_EML_CAPABILITY: EML Capability information (u16) * @NL80211_ATTR_MLD_CAPA_AND_OPS: MLD Capabilities and Operations (u16) * + * @NL80211_ATTR_TX_HW_TIMESTAMP: Hardware timestamp for TX operation in + * nanoseconds (u64). This is the device clock timestamp so it will + * probably reset when the device is stopped or the firmware is reset. + * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the frame TX + * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates + * the ack TX timestamp. + * @NL80211_ATTR_RX_HW_TIMESTAMP: Hardware timestamp for RX operation in + * nanoseconds (u64). This is the device clock timestamp so it will + * probably reset when the device is stopped or the firmware is reset. + * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX + * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates + * the incoming frame RX timestamp. * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3245,6 +3262,9 @@ enum nl80211_attrs { NL80211_ATTR_EML_CAPABILITY, NL80211_ATTR_MLD_CAPA_AND_OPS, + NL80211_ATTR_TX_HW_TIMESTAMP, + NL80211_ATTR_RX_HW_TIMESTAMP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 95f498bb49f7030c1f40236107e5241e50f79ade Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Jul 2022 12:13:46 +0200 Subject: wifi: nl80211: add MLO link ID to the NL80211_CMD_FRAME TX API Allow optionally specifying the link ID to transmit on, which can be done instead of the link frequency, on an MLD addressed frame. Both can also be omitted in which case the frame must be MLD addressed and link selection (and address translation) will be done on lower layers. 
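Under the same caveat, a sketch of requesting TX on a specific link with libnl-3; nl80211_id, ifindex, link_id, frame and frame_len are assumed to exist in the caller:

struct nl_msg *msg = nlmsg_alloc();

genlmsg_put(msg, 0, 0, nl80211_id, 0, 0, NL80211_CMD_FRAME, 0);
nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);
nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id);	/* instead of NL80211_ATTR_WIPHY_FREQ */
nla_put(msg, NL80211_ATTR_FRAME, frame_len, frame);
/* send on the genl socket and wait for NL80211_CMD_FRAME_TX_STATUS */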
Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 12 ++++++++++++ 3 files changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8545ed098d90..908d58393484 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3411,6 +3411,9 @@ struct cfg80211_update_ft_ies_params { * @dont_wait_for_ack: tells the low level not to wait for an ack * @n_csa_offsets: length of csa_offsets array * @csa_offsets: array of all the csa offsets in the frame + * @link_id: for MLO, the link ID to transmit on, -1 if not given; note + * that the link ID isn't validated (much), it's in range but the + * link might not exist (or be used by the receiver STA) */ struct cfg80211_mgmt_tx_params { struct ieee80211_channel *chan; @@ -3422,6 +3425,7 @@ struct cfg80211_mgmt_tx_params { bool dont_wait_for_ack; int n_csa_offsets; const u16 *csa_offsets; + int link_id; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5275dcbc5ee8..ffb7c573e299 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -764,6 +764,10 @@ * %NL80211_ATTR_CSA_C_OFFSETS_TX is an array of offsets to CSA * counters which will be updated to the current value. This attribute * is used during CSA period. + * For TX on an MLD, the frequency can be omitted and the link ID be + * specified, or if transmitting to a known peer MLD (with MLD addresses + * in the frame) both can be omitted and the link will be selected by + * lower layers. * For RX notification, %NL80211_ATTR_RX_HW_TIMESTAMP may be included to * indicate the frame RX timestamp and %NL80211_ATTR_TX_HW_TIMESTAMP may * be included to indicate the ack TX timestamp. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 60b8406b8d7e..2705e3ee8fc4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12256,6 +12256,18 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) wdev_unlock(wdev); return -EBUSY; } + + params.link_id = nl80211_link_id_or_invalid(info->attrs); + /* + * This now races due to the unlock, but we cannot check + * the valid links for the _station_ anyway, so that's up + * to the driver. + */ + if (params.link_id >= 0 && + !(wdev->valid_links & BIT(params.link_id))) { + wdev_unlock(wdev); + return -EINVAL; + } wdev_unlock(wdev); params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); -- cgit v1.2.3 From 6d8c5afc9ab14595707ff25d971dde45728eba3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Jul 2022 18:38:17 +0800 Subject: ublk_drv: make sure that correct flags(features) returned to userspace Userspace may support more features or newly added flags, but the driver side can be old, so make sure the correct flags (features) are returned to userspace, then userspace can work as expected. Also mark the 2nd flags field as reserved, and just use the 1st one. When we run out of flags, the reserved one can be handled at that time.
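A short hedged sketch of the resulting negotiation on the ublksrv side; the control-device plumbing around it is assumed:

/* after UBLK_CMD_ADD_DEV, dev_info.flags contains only what the driver kept */
if (!(dev_info.flags & UBLK_F_URING_CMD_COMP_IN_TASK)) {
	/* driver is older than this feature: fall back accordingly */
}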
Reviewed-by: Christoph Hellwig Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220722103817.631258-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 17 ++++++++++++++--- include/uapi/linux/ublk_cmd.h | 7 ++++--- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 67f91a80a7ab..255b2de46a24 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -46,6 +46,9 @@ #define UBLK_MINORS (1U << MINORBITS) +/* All UBLK_F_* have to be included into UBLK_F_ALL */ +#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK) + struct ublk_rq_data { struct callback_head work; }; @@ -953,7 +956,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; - ubq->flags = ub->dev_info.flags[0]; + ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; size = ublk_queue_cmd_buf_size(ub, q_id); @@ -1246,7 +1249,7 @@ out_put_device: static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, - info->dev_id, info->flags[0]); + info->dev_id, info->flags); pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", info->nr_hw_queues, info->queue_depth, info->block_size, info->dev_blocks); @@ -1298,8 +1301,16 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) /* update device id */ ub->dev_info.dev_id = ub->ub_number; + /* + * 64bit flags will be copied back to userspace as feature + * negotiation result, so have to clear flags which driver + * doesn't support yet, then userspace can get correct flags + * (features) to handle. + */ + ub->dev_info.flags &= UBLK_F_ALL; + /* We are not ready to support zero copy */ - ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY; + ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; ub->bs_shift = ilog2(ub->dev_info.block_size); ub->dev_info.nr_hw_queues = min_t(unsigned int, diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 917580b34198..ca33092354ab 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -46,13 +46,13 @@ * zero copy requires 4k block size, and can remap ublk driver's io * request into ublksrv's vm space */ -#define UBLK_F_SUPPORT_ZERO_COPY (1UL << 0) +#define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0) /* * Force to complete io cmd via io_uring_cmd_complete_in_task so that * performance comparison is done easily with using task_work_add */ -#define UBLK_F_URING_CMD_COMP_IN_TASK (1UL << 1) +#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1) /* device state */ #define UBLK_S_DEV_DEAD 0 @@ -88,7 +88,8 @@ struct ublksrv_ctrl_dev_info { __s32 ublksrv_pid; __s32 reserved0; - __u64 flags[2]; + __u64 flags; + __u64 flags_reserved; /* For ublksrv internal use, invisible to ublk driver */ __u64 ublksrv_flags; -- cgit v1.2.3 From b9ba8a4463cd78d0aee520c4bf2569820ac29929 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 May 2022 10:55:07 -0600 Subject: io_uring: add support for level triggered poll By default, the POLL_ADD command does edge triggered poll - if we get a non-zero mask on the initial poll attempt, we complete the request successfully. Support level triggered by always waiting for a notification, regardless of whether or not the initial mask matches the file state. 
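A minimal sketch of opting in from userspace, assuming liburing; per the uapi below, the poll flags live in sqe->len:

struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

io_uring_prep_poll_multishot(sqe, fd, POLLIN);
sqe->len |= IORING_POLL_ADD_LEVEL;	/* rearm on level, not only on edges */
io_uring_submit(&ring);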
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ io_uring/poll.c | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ad3da28d2fc..4927bb69387a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -229,10 +229,13 @@ enum io_uring_op { * * IORING_POLL_UPDATE Update existing poll request, matching * sqe->addr as the old user_data field. + * + * IORING_POLL_LEVEL Level triggered poll. */ #define IORING_POLL_ADD_MULTI (1U << 0) #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +#define IORING_POLL_ADD_LEVEL (1U << 3) /* * ASYNC_CANCEL flags. */ diff --git a/io_uring/poll.c b/io_uring/poll.c index b80f7fa26123..558dc170468a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -423,11 +423,13 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (mask && (poll->events & EPOLLONESHOT)) { + if (mask && + ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); /* no one else has access to the req, forget about the ref */ return mask; } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!ipt->error) @@ -439,7 +441,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, io_poll_req_insert(req); spin_unlock(&ctx->completion_lock); - if (mask) { + if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; @@ -475,7 +477,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; + __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; if (!def->pollin && !def->pollout) @@ -638,7 +640,10 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); + if (!(flags & IORING_POLL_ADD_LEVEL)) + events |= EPOLLET; + return demangle_poll(events) | + (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -679,7 +684,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) + if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL)) return -EINVAL; if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; -- cgit v1.2.3 From 97bbdc06a4446bc69d8ba71d722abae542a6b70c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:08 +0100 Subject: io_uring: add IORING_SETUP_SINGLE_ISSUER Add a new IORING_SETUP_SINGLE_ISSUER flag and the userspace-visible part of it, i.e. put limitations on submitters. Also, don't allow it together with IOPOLL as we're not going to put it to good use.
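A hedged setup sketch, assuming liburing:

struct io_uring_params p = { .flags = IORING_SETUP_SINGLE_ISSUER };
struct io_uring ring;

/* the patch disallows combining this with IORING_SETUP_IOPOLL */
int ret = io_uring_queue_init_params(64, &ring, &p);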
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4bcc41ee467fdf04c8aab8baf6ce3ba21858c3d4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++++- io_uring/io_uring.c | 7 +++++-- io_uring/io_uring_types.h | 1 + io_uring/tctx.c | 27 ++++++++++++++++++++++++--- io_uring/tctx.h | 4 ++-- 5 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4927bb69387a..d7ae81b10893 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -140,9 +140,12 @@ enum { * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) - #define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ #define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ +/* + * Only one task is allowed to submit requests + */ +#define IORING_SETUP_SINGLE_ISSUER (1U << 12) enum io_uring_op { IORING_OP_NOP, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 85a479594b05..06772139b7da 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2457,6 +2457,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_destroy_buffers(ctx); if (ctx->sq_creds) put_cred(ctx->sq_creds); + if (ctx->submitter_task) + put_task_struct(ctx->submitter_task); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) @@ -3189,7 +3191,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); if (ret) { put_unused_fd(fd); return ret; @@ -3409,7 +3411,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) + IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | + IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1f8db2dd7af7..8b00243abf65 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -243,6 +243,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct io_restriction restrictions; + struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ struct io_rsrc_node *rsrc_backup_node; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 5a5d4f908529..a819da8fc85c 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -94,12 +94,32 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return 0; } -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +static int io_register_submitter(struct io_ring_ctx *ctx) +{ + int ret = 0; + + mutex_lock(&ctx->uring_lock); + if (!ctx->submitter_task) + ctx->submitter_task = get_task_struct(current); + else if (ctx->submitter_task != current) + ret = -EEXIST; + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; + if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) { + ret = io_register_submitter(ctx); + if (ret) + return ret; + } + if (unlikely(!tctx)) { ret = 
io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) @@ -133,7 +153,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - tctx->last = ctx; + if (submitter) + tctx->last = ctx; return 0; } @@ -241,7 +262,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, return -EINVAL; mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); mutex_lock(&ctx->uring_lock); if (ret) return ret; diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 7684713e950f..dde82ce4d8e2 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -34,7 +34,7 @@ struct io_tctx_node { int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); @@ -52,5 +52,5 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_tctx_node(ctx); + return __io_uring_add_tctx_node(ctx, true); } -- cgit v1.2.3 From 7d8ca7250197096bfa9f432c1d99b0555504bbba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:47:04 -0600 Subject: io_uring: add IORING_ASYNC_CANCEL_FD_FIXED cancel flag In preparation for not having a request to pass in that carries this state, add a separate cancelation flag that allows the caller to ask for a fixed file for cancelation. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ io_uring/cancel.c | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d7ae81b10893..a09a78bd7556 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -247,10 +247,12 @@ enum io_uring_op { * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the * request 'user_data' * IORING_ASYNC_CANCEL_ANY Match any request + * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) +#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) /* * send/sendmsg and recv/recvmsg flags (sqe->ioprio) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 500ee5f5fd23..da486de07029 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -24,7 +24,7 @@ struct io_cancel { }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) static bool io_cancel_cb(struct io_wq_work *work, void *data) { @@ -174,11 +174,14 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE || + cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) { + req->flags |= REQ_F_FIXED_FILE; req->file = io_file_get_fixed(req, cancel->fd, issue_flags); - else + } else { req->file = io_file_get_normal(req, cancel->fd); + } if (!req->file) { ret = -EBADF; goto done; -- cgit v1.2.3 From 78a861b9495920f8609dee5b670dacbff09d359f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 10:00:50 -0600 Subject: io_uring: 
add sync cancelation API through io_uring_register() The io_uring cancelation API is async, like any other API that we expose there. For the case of finding a request to cancel, or not finding one, it is fully sync in that when submission returns, the CQE for both the cancelation request and the targeted request have been posted to the CQ ring. However, if the targeted work is being executed by io-wq, the API can only start the act of canceling it. This makes it difficult to use in some circumstances, as the caller then has to wait for the CQEs to come in and match on the same cancelation data there. Provide an IORING_REGISTER_SYNC_CANCEL command for io_uring_register() that does sync cancelations, always. For the io-wq case, it'll wait for the cancelation to come in before returning. The only expected returns from this API are: 0 Request found and canceled fine. > 0 Requests found and canceled. Only happens if asked to cancel multiple requests, and if the work wasn't in progress. -ENOENT Request not found. -ETIME A timeout on the operation was requested, but the timeout expired before we could cancel. We won't get -EALREADY via this API. If the timeout value passed in is -1 (tv_sec and tv_nsec), then that means that no timeout is requested. Otherwise, the timespec passed in is the amount of time the sync cancel will wait for a successful cancelation. Link: https://github.com/axboe/liburing/discussions/608 Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 15 ++++++ io_uring/cancel.c | 107 ++++++++++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 2 + io_uring/io_uring.c | 6 +++ 4 files changed, 130 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a09a78bd7556..094f706c93e0 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -10,6 +10,7 @@ #include #include +#include <linux/time_types.h> /* * IO submission data structure (Submission Queue Entry) @@ -428,6 +429,9 @@ enum { IORING_REGISTER_PBUF_RING = 22, IORING_UNREGISTER_PBUF_RING = 23, + /* sync cancelation API */ + IORING_REGISTER_SYNC_CANCEL = 24, + /* this goes last */ IORING_REGISTER_LAST }; @@ -563,4 +567,15 @@ struct io_uring_getevents_arg { __u64 ts; }; +/* + * Argument for IORING_REGISTER_SYNC_CANCEL + */ +struct io_uring_sync_cancel_reg { + __u64 addr; + __s32 fd; + __u32 flags; + struct __kernel_timespec timeout; + __u64 pad[4]; +}; + #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index da486de07029..8435a1eba59a 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -6,6 +6,7 @@ #include #include #include +#include <linux/nospec.h> #include #include @@ -206,3 +207,109 @@ void init_hash_table(struct io_hash_table *table, unsigned size) INIT_HLIST_HEAD(&table->hbs[i].list); } } + +static int __io_sync_cancel(struct io_uring_task *tctx, + struct io_cancel_data *cd, int fd) +{ + struct io_ring_ctx *ctx = cd->ctx; + + /* fixed must be grabbed every time since we drop the uring_lock */ + if ((cd->flags & IORING_ASYNC_CANCEL_FD) && + (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { + unsigned long file_ptr; + + if (unlikely(fd > ctx->nr_user_files)) + return -EBADF; + fd = array_index_nospec(fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + cd->file = (struct file *) (file_ptr & FFS_MASK); + if (!cd->file) + return -EBADF; + } + + return __io_async_cancel(cd, tctx, 0); +} + +int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) + __must_hold(&ctx->uring_lock) +{ + struct io_cancel_data
cd = { + .ctx = ctx, + .seq = atomic_inc_return(&ctx->cancel_seq), + }; + ktime_t timeout = KTIME_MAX; + struct io_uring_sync_cancel_reg sc; + struct fd f = { }; + DEFINE_WAIT(wait); + int ret; + + if (copy_from_user(&sc, arg, sizeof(sc))) + return -EFAULT; + if (sc.flags & ~CANCEL_FLAGS) + return -EINVAL; + if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) + return -EINVAL; + + cd.data = sc.addr; + cd.flags = sc.flags; + + /* we can grab a normal file descriptor upfront */ + if ((cd.flags & IORING_ASYNC_CANCEL_FD) && + !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) { + f = fdget(sc.fd); + if (!f.file) + return -EBADF; + cd.file = f.file; + } + + ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + + /* found something, done! */ + if (ret != -EALREADY) + goto out; + + if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) { + struct timespec64 ts = { + .tv_sec = sc.timeout.tv_sec, + .tv_nsec = sc.timeout.tv_nsec + }; + + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } + + /* + * Keep looking until we get -ENOENT. we'll get woken everytime + * every time a request completes and will retry the cancelation. + */ + do { + cd.seq = atomic_inc_return(&ctx->cancel_seq); + + prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); + + ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + + if (ret != -EALREADY) + break; + + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(); + if (ret < 0) { + mutex_lock(&ctx->uring_lock); + break; + } + ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); + mutex_lock(&ctx->uring_lock); + if (!ret) { + ret = -ETIME; + break; + } + } while (1); + + finish_wait(&ctx->cq_wait, &wait); + + if (ret == -ENOENT || ret > 0) + ret = 0; +out: + fdput(f); + return ret; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 1bc7e917ce94..6a59ee484d0c 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -19,3 +19,5 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); + +int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 997b915a1ff7..45538b3c3a76 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3871,6 +3871,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_pbuf_ring(ctx, arg); break; + case IORING_REGISTER_SYNC_CANCEL: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_sync_cancel(ctx, arg); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From 8fcf4c48f44bd7b1b75db139f56ff1ad6477379e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 28 Jun 2022 21:33:20 +0200 Subject: io_uring: replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 094f706c93e0..8fe0275cdaf3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -495,7 +495,7 @@ struct io_uring_probe { __u8 ops_len; /* length of ops[] array below */ __u16 resv; __u32 resv2[3]; - struct io_uring_probe_op ops[0]; + struct io_uring_probe_op ops[]; }; struct io_uring_restriction { -- cgit v1.2.3 From e6130eba8a848a7a6ba6c534bd8f6d60749ae1a9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:47:02 -0600 Subject: io_uring: add support for passing fixed file descriptors With IORING_OP_MSG_RING, one ring can send a message to another ring. Extend that support to also allow sending a fixed file descriptor to that ring, enabling one ring to pass a registered descriptor to another one. Arguments are extended to pass in: sqe->addr3 fixed file slot in source ring sqe->file_index fixed file slot in destination ring IORING_OP_MSG_RING is extended to take a command argument in sqe->addr. If set to zero (or IORING_MSG_DATA), it sends just a message like before. If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according to the above arguments. Two common use cases for this are: 1) Server needs to be shut down or restarted, pass file descriptors to another one. 2) Backend is split, and one accepts connections, while others then get the fd passed and handle the actual connection. Both of those are classic SCM_RIGHTS use cases, and it's not possible to support them with direct descriptors today. By default, this will post a CQE to the target ring, similarly to how IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message is posted to the target ring. The issuer is expected to notify the receiver side separately. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 ++++++ io_uring/msg_ring.c | 130 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 140 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8fe0275cdaf3..f378eabbff21 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -51,6 +51,7 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; + __u32 msg_ring_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -270,6 +271,22 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +/* + * IORING_OP_MSG_RING command types, stored in sqe->addr + */ +enum { + IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ + IORING_MSG_SEND_FD, /* send a registered fd to another ring */ +}; + +/* + * IORING_OP_MSG_RING flags (sqe->msg_ring_flags) + * + * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not + * applicable for IORING_MSG_DATA, obviously.
+ */ +#define IORING_MSG_RING_CQE_SKIP (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index b02be2349652..939205b30c8b 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -3,46 +3,162 @@ #include #include #include +#include #include #include #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" #include "msg_ring.h" struct io_msg { struct file *file; u64 user_data; u32 len; + u32 cmd; + u32 src_fd; + u32 dst_fd; + u32 flags; }; +static int io_msg_ring_data(struct io_kiocb *req) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (msg->src_fd || msg->dst_fd || msg->flags) + return -EINVAL; + + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + return 0; + + return -EOVERFLOW; +} + +static void io_double_unlock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); + mutex_unlock(&octx->uring_lock); +} + +static int io_double_lock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + /* + * To ensure proper ordering between the two ctxs, we can only + * attempt a trylock on the target. If that fails and we already have + * the source ctx lock, punt to io-wq. + */ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + if (!mutex_trylock(&octx->uring_lock)) + return -EAGAIN; + return 0; + } + + /* Always grab smallest value ctx first. We know ctx != octx. */ + if (ctx < octx) { + mutex_lock(&ctx->uring_lock); + mutex_lock(&octx->uring_lock); + } else { + mutex_lock(&octx->uring_lock); + mutex_lock(&ctx->uring_lock); + } + + return 0; +} + +static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long file_ptr; + struct file *src_file; + int ret; + + if (target_ctx == ctx) + return -EINVAL; + + ret = io_double_lock_ctx(ctx, target_ctx, issue_flags); + if (unlikely(ret)) + return ret; + + ret = -EBADF; + if (unlikely(msg->src_fd >= ctx->nr_user_files)) + goto out_unlock; + + msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; + src_file = (struct file *) (file_ptr & FFS_MASK); + get_file(src_file); + + ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); + if (ret < 0) { + fput(src_file); + goto out_unlock; + } + + if (msg->flags & IORING_MSG_RING_CQE_SKIP) + goto out_unlock; + + /* + * If this fails, the target still received the file descriptor but + * wasn't notified of the fact. This means that if this request + * completes with -EOVERFLOW, then the sender must ensure that a + * later IORING_OP_MSG_RING delivers the message. 
+ */ + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + ret = -EOVERFLOW; +out_unlock: + io_double_unlock_ctx(ctx, target_ctx, issue_flags); + return ret; +} + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req); - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) + if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); + msg->cmd = READ_ONCE(sqe->addr); + msg->src_fd = READ_ONCE(sqe->addr3); + msg->dst_fd = READ_ONCE(sqe->file_index); + msg->flags = READ_ONCE(sqe->msg_ring_flags); + if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + return -EINVAL; + return 0; } int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; int ret; ret = -EBADFD; if (!io_is_uring_fops(req->file)) goto done; - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) - ret = 0; + switch (msg->cmd) { + case IORING_MSG_DATA: + ret = io_msg_ring_data(req); + break; + case IORING_MSG_SEND_FD: + ret = io_msg_send_fd(req, issue_flags); + break; + default: + ret = -EINVAL; + break; + } done: if (ret < 0) -- cgit v1.2.3 From 6e73dffbb93cb8797cd4e42e98d837edf0f1a967 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:55:38 +0100 Subject: io_uring: let to set a range for file slot allocation io_uring recently gained an option to allocate a file index for operations that register fixed files. However, it's unusable with mixed approaches, where userspace knows better where to place some of the files: the allocation may race, and users have no sane way to pick a slot and hope it will not be taken. Let userspace register a range of fixed file slots in which the auto-allocation happens. The use case is splitting the fixed table in two parts, where one of them is used for auto-allocation and the other for slot-specified operations.
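For context, a minimal userspace sketch of driving the new registration (an illustration under assumptions, not part of the patch: it uses a raw io_uring_register(2) syscall and assumes a uapi header new enough to carry these definitions):

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/*
 * Reserve slots [0, off) for explicit placement and let the kernel
 * auto-allocate only from slots [off, off + len).
 */
static int set_file_alloc_range(int ring_fd, __u32 off, __u32 len)
{
	struct io_uring_file_index_range range;

	memset(&range, 0, sizeof(range));	/* resv must be zero */
	range.off = off;
	range.len = len;
	/* nr_args must be 0 for this opcode; the range travels via arg */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILE_ALLOC_RANGE, &range, 0);
}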
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66ab0394e436f38437cf7c44676e1920d09687ad.1656154403.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ include/uapi/linux/io_uring.h | 13 +++++++++++++ io_uring/filetable.c | 24 ++++++++++++++++++++---- io_uring/filetable.h | 20 +++++++++++++++++--- io_uring/io_uring.c | 6 ++++++ io_uring/rsrc.c | 2 ++ 6 files changed, 61 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3ca8f363f504..26ef11e978d4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -233,6 +233,9 @@ struct io_ring_ctx { unsigned long check_cq; + unsigned int file_alloc_start; + unsigned int file_alloc_end; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f378eabbff21..cf95354198a3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,9 @@ enum { /* sync cancelation API */ IORING_REGISTER_SYNC_CANCEL = 24, + /* register a range of fixed file slots for automatic slot allocation */ + IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* this goes last */ IORING_REGISTER_LAST }; @@ -595,4 +598,14 @@ struct io_uring_sync_cancel_reg { __u64 pad[4]; }; +/* + * Argument for IORING_REGISTER_FILE_ALLOC_RANGE + * The range is specified as [off, off + len) + */ +struct io_uring_file_index_range { + __u32 off; + __u32 len; + __u64 resv; +}; + #endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index abaa5ba7f655..7b473259f3f4 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -16,7 +16,7 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; + unsigned long nr = ctx->file_alloc_end; int ret; do { @@ -24,11 +24,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) if (ret != nr) return ret; - if (!table->alloc_hint) + if (table->alloc_hint == ctx->file_alloc_start) break; - nr = table->alloc_hint; - table->alloc_hint = 0; + table->alloc_hint = ctx->file_alloc_start; } while (1); return -ENFILE; @@ -175,3 +174,20 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) io_rsrc_node_switch(ctx, ctx->file_data); return 0; } + +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg) +{ + struct io_uring_file_index_range range; + u32 end; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + if (check_add_overflow(range.off, range.len, &end)) + return -EOVERFLOW; + if (range.resv || end > ctx->nr_user_files) + return -EINVAL; + + io_file_table_set_alloc_range(ctx, range.off, range.len); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 79eb50c1980e..ff3a712e11bf 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,9 +3,7 @@ #define IOU_FILE_TABLE_H #include - -struct io_ring_ctx; -struct io_kiocb; +#include /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -33,6 +31,9 @@ int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg); + unsigned int io_file_get_flags(struct file *file); 
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) @@ -71,4 +72,17 @@ static inline void io_fixed_file_set(struct io_fixed_file *file_slot, file_slot->file_ptr = file_ptr; } +static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) +{ + ctx->file_table.alloc_hint = ctx->file_alloc_start; +} + +static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, + unsigned off, unsigned len) +{ + ctx->file_alloc_start = off; + ctx->file_alloc_end = off + len; + io_reset_alloc_hint(ctx); +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 070ee9ec9ee7..745264938a48 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3866,6 +3866,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sync_cancel(ctx, arg); break; + case IORING_REGISTER_FILE_ALLOC_RANGE: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_file_alloc_range(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 706fa020505b..d2e589c703d0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1012,6 +1012,8 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_bitmap_set(&ctx->file_table, i); } + /* default it to the whole table */ + io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); io_rsrc_node_switch(ctx, NULL); return 0; fail: -- cgit v1.2.3 From b3fdea6ecb55c3ceea866ff66486927e51a982b3 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:29 -0700 Subject: io_uring: multishot recv Support multishot receive for io_uring. Typical server applications will run a loop where for each recv CQE it requeues another recv/recvmsg. This can be simplified by using the existing multishot functionality combined with io_uring's provided buffers. The API is to add the IORING_RECV_MULTISHOT flag to the SQE. CQEs will then be posted (with IORING_CQE_F_MORE flag set) when data is available and is read. Once an error occurs or the socket ends, the multishot will be removed and a completion without IORING_CQE_F_MORE will be posted. The benefit to this is that the recv is much more performant. * Subsequent receives are queued up straight away without requiring the application to finish a processing loop. * If there is more data in the socket (say the provided buffer size is smaller than the socket buffer) then the data is immediately returned, improving batching. * Poll is only armed once and reused, saving CPU cycles. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-11-dylany@fb.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++ io_uring/net.c | 102 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 94 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf95354198a3..499679134961 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -263,8 +263,13 @@ enum io_uring_op { * or receive and arm poll if that yields an * -EAGAIN result, arm poll upfront and skip * the initial transfer attempt. + * + * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if + * the handler will continue to report + * CQEs on behalf of the same SQE.
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) +#define IORING_RECV_MULTISHOT (1U << 1) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index e1eaf902f3b2..cb08a4b62840 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -389,6 +389,8 @@ int io_recvmsg_prep_async(struct io_kiocb *req) return ret; } +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) + int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -399,13 +401,22 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~(RECVMSG_FLAGS)) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -415,6 +426,48 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static inline void io_recv_prep_retry(struct io_kiocb *req) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ +} + +/* + * Finishes io_recv. + * + * Returns true if it is actually finished, or false if it should run + * again (for multishot). + */ +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +{ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; + } + + if (*ret > 0) { + if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, + cflags | IORING_CQE_F_MORE, false)) { + io_recv_prep_retry(req); + return false; + } + /* + * Otherwise stop multishot but use the current result.
+ * Probably will end up going into overflow, but this means + * we cannot trust the ordering anymore + */ + } + + io_req_set_res(req, *ret, cflags); + + if (req->flags & REQ_F_POLLED) + *ret = IOU_STOP_MULTISHOT; + return true; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -424,6 +477,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -442,16 +496,17 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -463,8 +518,15 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -491,8 +553,11 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -505,6 +570,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) @@ -514,16 +580,17 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; } - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -543,8 +610,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) ret = sock_recvmsg(sock, &msg, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -570,8 +643,11 @@ out_free: cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) 
cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -- cgit v1.2.3 From 9bb66906f23e50d6db1e11f7498b72dfca1982a2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:58 -0700 Subject: io_uring: support multishot in recvmsg Similar to multishot recv, this will require provided buffers to be used. However, recvmsg is much more complex than recv as it has multiple outputs: specifically flags, name, and control messages. Support this by introducing a new struct io_uring_recvmsg_out with four fields: namelen, controllen, payloadlen, and flags. namelen, controllen, and flags match the corresponding out fields in msghdr from standard recvmsg(2); payloadlen is the length of the payload following the header. This struct is placed at the start of the returned buffer. Based on what the user specifies in struct msghdr, the next bytes of the buffer will be name (the next msg_namelen bytes), and then control (the next msg_controllen bytes). The payload will come at the end. The return value in the CQE is the total used size of the provided buffer. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-4-dylany@fb.com [axboe: style fixups, see link] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++ io_uring/net.c | 180 +++++++++++++++++++++++++++++++++++++----- io_uring/net.h | 6 ++ 3 files changed, 174 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 499679134961..4c9b11e2e991 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -613,4 +613,11 @@ struct io_uring_file_index_range { __u64 resv; }; +struct io_uring_recvmsg_out { + __u32 namelen; + __u32 controllen; + __u32 payloadlen; + __u32 flags; +}; + #endif diff --git a/io_uring/net.c b/io_uring/net.c index 5bc3440a8290..616d5f04cc74 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -325,6 +325,21 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) +{ + unsigned long hdr; + + if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), + (unsigned long)iomsg->namelen, &hdr)) + return true; + if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + return true; + if (hdr > INT_MAX) + return true; + + return false; +} + static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { @@ -352,6 +367,13 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, @@ -399,6 +421,13 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = clen; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, @@ -455,8 +484,6 @@ int io_recvmsg_prep(struct io_kiocb
*req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { - if (req->opcode == IORING_OP_RECVMSG) - return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -483,12 +510,13 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv + * Finishes io_recv and io_recvmsg. * * Returns true if it is actually finished, or false if it should run * again (for multishot). */ -static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, + unsigned int cflags, bool mshot_finished) { if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); @@ -496,7 +524,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } - if (*ret > 0) { + if (!mshot_finished) { if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, false)) { io_recv_prep_retry(req); @@ -518,6 +546,90 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } +static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg, + struct io_sr_msg *sr, void __user **buf, + size_t *len) +{ + unsigned long ubuf = (unsigned long) *buf; + unsigned long hdr; + + hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen; + if (*len < hdr) + return -EFAULT; + + if (kmsg->controllen) { + unsigned long control = ubuf + hdr - kmsg->controllen; + + kmsg->msg.msg_control_user = (void *) control; + kmsg->msg.msg_controllen = kmsg->controllen; + } + + sr->buf = *buf; /* stash for later copy */ + *buf = (void *) (ubuf + hdr); + kmsg->payloadlen = *len = *len - hdr; + return 0; +} + +struct io_recvmsg_multishot_hdr { + struct io_uring_recvmsg_out msg; + struct sockaddr_storage addr; +}; + +static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, + struct io_async_msghdr *kmsg, + unsigned int flags, bool *finished) +{ + int err; + int copy_len; + struct io_recvmsg_multishot_hdr hdr; + + if (kmsg->namelen) + kmsg->msg.msg_name = &hdr.addr; + kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); + kmsg->msg.msg_namelen = 0; + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + + err = sock_recvmsg(sock, &kmsg->msg, flags); + *finished = err <= 0; + if (err < 0) + return err; + + hdr.msg = (struct io_uring_recvmsg_out) { + .controllen = kmsg->controllen - kmsg->msg.msg_controllen, + .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT + }; + + hdr.msg.payloadlen = err; + if (err > kmsg->payloadlen) + err = kmsg->payloadlen; + + copy_len = sizeof(struct io_uring_recvmsg_out); + if (kmsg->msg.msg_namelen > kmsg->namelen) + copy_len += kmsg->namelen; + else + copy_len += kmsg->msg.msg_namelen; + + /* + * "fromlen shall refer to the value before truncation.." 
+ * 1003.1g + */ + hdr.msg.namelen = kmsg->msg.msg_namelen; + + /* ensure that there is no gap between hdr and sockaddr_storage */ + BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) != + sizeof(struct io_uring_recvmsg_out)); + if (copy_to_user(io->buf, &hdr, copy_len)) { + *finished = true; + return -EFAULT; + } + + return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen + err; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -527,6 +639,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool mshot_finished = true; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -545,16 +658,27 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; + size_t len = sr->len; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); + if (ret) { + io_kbuf_recycle(req, issue_flags); + return ret; + } + } + kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -564,10 +688,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (req->flags & REQ_F_APOLL_MULTISHOT) + ret = io_recvmsg_multishot(sock, sr, kmsg, flags, + &mshot_finished); + else + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, + kmsg->uaddr, flags); + if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -580,11 +717,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; else if (sr->done_io) @@ -596,8 +728,18 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + if (!io_recv_finish(req, &ret, cflags, mshot_finished)) + goto retry_multishot; + + if (mshot_finished) { + io_netmsg_recycle(req, issue_flags); + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + } + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -684,7 +826,7 @@ out_free: if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!io_recv_finish(req, &ret, cflags)) + if 
(!io_recv_finish(req, &ret, cflags, ret <= 0)) goto retry_multishot; return ret; diff --git a/io_uring/net.h b/io_uring/net.h index 178a6d8b76e0..db20ce9d6546 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -9,6 +9,12 @@ struct io_async_msghdr { union { struct iovec fast_iov[UIO_FASTIOV]; + struct { + struct iovec fast_iov_one; + __kernel_size_t controllen; + int namelen; + __kernel_size_t payloadlen; + }; struct io_cache_entry cache; }; /* points to an allocated iov, if NULL we use fast_iov instead */ -- cgit v1.2.3 From bc24d6bd32df0be19df3d30e74be4ba56493c0e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:42 +0100 Subject: io_uring: add notification slot registration Let userspace register and unregister notification slots. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a0aa8161fe3ebb2a4cc6e5dbd0cffb96e6881cf5.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 +++++++++++++++++ io_uring/io_uring.c | 9 +++++++++ io_uring/notif.c | 43 +++++++++++++++++++++++++++++++++++++++++++ io_uring/notif.h | 3 +++ 4 files changed, 72 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4c9b11e2e991..dcfc7a0bda0c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -457,6 +457,10 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* zerocopy notification API */ + IORING_REGISTER_NOTIFIERS = 26, + IORING_UNREGISTER_NOTIFIERS = 27, + /* this goes last */ IORING_REGISTER_LAST }; @@ -503,6 +507,19 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; +struct io_uring_notification_slot { + __u64 tag; + __u64 resv[3]; +}; + +struct io_uring_notification_register { + __u32 nr_slots; + __u32 resv; + __u64 resv2; + __u64 data; + __u64 resv3; +}; + /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 20e65d45ca1c..cae11374456e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3870,6 +3870,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_file_alloc_range(ctx, arg); break; + case IORING_REGISTER_NOTIFIERS: + ret = io_notif_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_NOTIFIERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_notif_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/io_uring/notif.c b/io_uring/notif.c index 0a2e98bd74f6..e6d98dc208c7 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -162,5 +162,48 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx) kvfree(ctx->notif_slots); ctx->notif_slots = NULL; ctx->nr_notif_slots = 0; + io_notif_cache_purge(ctx); + return 0; +} + +__cold int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size) + __must_hold(&ctx->uring_lock) +{ + struct io_uring_notification_slot __user *slots; + struct io_uring_notification_slot slot; + struct io_uring_notification_register reg; + unsigned i; + + if (ctx->nr_notif_slots) + return -EBUSY; + if (size != sizeof(reg)) + return -EINVAL; + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS) + return -EINVAL; + if (reg.resv || reg.resv2 || reg.resv3) + return -EINVAL; + + slots = u64_to_user_ptr(reg.data); + ctx->notif_slots
= kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]), + GFP_KERNEL_ACCOUNT); + if (!ctx->notif_slots) + return -ENOMEM; + + for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) { + struct io_notif_slot *notif_slot = &ctx->notif_slots[i]; + + if (copy_from_user(&slot, &slots[i], sizeof(slot))) { + io_notif_unregister(ctx); + return -EFAULT; + } + if (slot.resv[0] | slot.resv[1] | slot.resv[2]) { + io_notif_unregister(ctx); + return -EINVAL; + } + notif_slot->tag = slot.tag; + } return 0; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 1dd48efb7744..00efe164bdc4 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -6,6 +6,7 @@ #include #define IO_NOTIF_SPLICE_BATCH 32 +#define IORING_MAX_NOTIF_SLOTS (1U << 10) struct io_notif { struct ubuf_info uarg; @@ -48,6 +49,8 @@ struct io_notif_slot { u32 seq; }; +int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size); int io_notif_unregister(struct io_ring_ctx *ctx); void io_notif_cache_purge(struct io_ring_ctx *ctx); -- cgit v1.2.3 From 06a5464be84e4ae48394d34441baf34bf9706827 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:43 +0100 Subject: io_uring: wire send zc request type Add a new io_uring opcode IORING_OP_SENDZC_NOTIF. The main distinction from IORING_OP_SEND is that the user should specify a notification slot index in sqe::notification_idx and the buffers are safe to reuse only when the used notification is flushed and completes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a80387c6a68ce9cf99b3b6ef6f71068468761fb7.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++ io_uring/net.c | 94 +++++++++++++++++++++++++++++++++++++++++++ io_uring/net.h | 3 ++ io_uring/opdef.c | 15 +++++++ 4 files changed, 117 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index dcfc7a0bda0c..82bf2991e9bd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -66,6 +66,10 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + struct { + __u16 notification_idx; + __u16 __pad; + }; }; union { struct { @@ -197,6 +201,7 @@ enum io_uring_op { IORING_OP_GETXATTR, IORING_OP_SOCKET, IORING_OP_URING_CMD, + IORING_OP_SENDZC_NOTIF, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index bbc9c603641a..89a8678ce69b 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -14,6 +14,7 @@ #include "kbuf.h" #include "alloc_cache.h" #include "net.h" +#include "notif.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -59,6 +60,15 @@ struct io_sr_msg { unsigned int flags; }; +struct io_sendzc { + struct file *file; + void __user *buf; + size_t len; + u16 slot_idx; + unsigned msg_flags; + unsigned flags; +}; + #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -834,6 +844,90 @@ out_free: return ret; } +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sendzc *zc = io_kiocb_to_cmd(req); + + if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) || + READ_ONCE(sqe->addr3)) + return -EINVAL; + + zc->flags = READ_ONCE(sqe->ioprio); + if (zc->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + zc->len = READ_ONCE(sqe->len); + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->slot_idx =
READ_ONCE(sqe->notification_idx); + if (zc->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + zc->msg_flags |= MSG_CMSG_COMPAT; +#endif + return 0; +} + +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_notif_slot *notif_slot; + struct io_notif *notif; + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned msg_flags; + int ret, min_ret = 0; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + if (issue_flags & IO_URING_F_UNLOCKED) + return -EAGAIN; + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + notif_slot = io_get_notif_slot(ctx, zc->slot_idx); + if (!notif_slot) + return -EINVAL; + notif = io_get_notif(ctx, notif_slot); + if (!notif) + return -ENOMEM; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg_flags = zc->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = msg_flags; + msg.msg_ubuf = &notif->uarg; + msg.sg_from_iter = NULL; + ret = sock_sendmsg(sock, &msg); + + if (unlikely(ret < min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + return ret == -ERESTARTSYS ? -EINTR : ret; + } + + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req); diff --git a/io_uring/net.h b/io_uring/net.h index db20ce9d6546..7c438d39c089 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + void io_netmsg_cache_free(struct io_cache_entry *entry); #else static inline void io_netmsg_cache_free(struct io_cache_entry *entry) { } #endif diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a7b84b43e6c2..7ab19bbf3126 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -470,6 +470,21 @@ const struct io_op_def io_op_defs[] = { .issue = io_uring_cmd, .prep_async = io_uring_cmd_prep_async, }, + [IORING_OP_SENDZC_NOTIF] = { + .name = "SENDZC_NOTIF", + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_sendzc_prep, + .issue = io_sendzc, +#else + .prep = io_eopnotsupp_prep, +#endif + + }, }; const char *io_uring_get_opcode(u8 opcode) -- cgit v1.2.3 From 092aeedb750a9fad0f0252d6067fc91d76ca44bd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:45 +0100 Subject: io_uring: allow to pass addr into sendzc Allow specifying an address for zerocopy sends, making them more like sendto(2).
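As a usage illustration, a sendto(2)-style zerocopy send could be prepared roughly as below. This is a hedged sketch only: it fills a raw SQE using the field names added above, assumes an IPv4 destination, and leaves submission and CQE handling to the caller.

#include <string.h>
#include <netinet/in.h>
#include <linux/io_uring.h>

static void prep_sendzc_addr(struct io_uring_sqe *sqe, int sockfd,
			     const void *buf, unsigned int len, __u16 slot_idx,
			     const struct sockaddr_in *dst)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SENDZC_NOTIF;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long)buf;		/* payload to send */
	sqe->len = len;
	sqe->notification_idx = slot_idx;	/* registered notification slot */
	sqe->addr2 = (unsigned long)dst;	/* destination address */
	sqe->addr_len = sizeof(*dst);
}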
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70417a8f7c5b51ab454690bae08adc0c187f89e8.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- io_uring/net.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 82bf2991e9bd..0736e2773a5d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -68,7 +68,7 @@ struct io_uring_sqe { __u32 file_index; struct { __u16 notification_idx; - __u16 __pad; + __u16 addr_len; }; }; union { diff --git a/io_uring/net.c b/io_uring/net.c index 2d04a70b0632..61414d865cd7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -67,6 +67,8 @@ struct io_sendzc { u16 slot_idx; unsigned msg_flags; unsigned flags; + unsigned addr_len; + void __user *addr; }; #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) @@ -848,8 +850,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req); - if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) || - READ_ONCE(sqe->addr3)) + if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); @@ -862,6 +863,10 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->slot_idx = READ_ONCE(sqe->notification_idx); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + zc->addr_len = READ_ONCE(sqe->addr_len); + #ifdef CONFIG_COMPAT if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; @@ -871,6 +876,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) { + struct sockaddr_storage address; struct io_ring_ctx *ctx = req->ctx; struct io_sendzc *zc = io_kiocb_to_cmd(req); struct io_notif_slot *notif_slot; @@ -908,6 +914,14 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) return ret; mm_account_pinned_pages(&notif->uarg.mmp, zc->len); + if (zc->addr) { + ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); + if (unlikely(ret < 0)) + return ret; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = zc->addr_len; + } + msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; -- cgit v1.2.3 From 10c7d33ecd51619e453cf6aeee8e326f8ba5cfea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:46 +0100 Subject: io_uring: sendzc with fixed buffers Allow zerocopy sends to use fixed buffers. There is an optimisation for this case: the network layer doesn't need to reference the pages (see SKBFL_MANAGED_FRAG_REFS), so io_uring has to ensure the validity of fixed buffers until the notifier is released.
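Building on the previous sketch, the fixed-buffer variant only changes two fields. Again an assumption-laden illustration: buf_index names a buffer registered earlier with IORING_REGISTER_BUFFERS, and sqe->addr/len must point inside that buffer.

#include <linux/io_uring.h>

static void sendzc_use_fixed_buf(struct io_uring_sqe *sqe, __u16 buf_index)
{
	/* zc flags travel in sqe->ioprio at prep time */
	sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
	/* which registered buffer sqe->addr/len point into */
	sqe->buf_index = buf_index;
}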
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1d8bd1b5934e541d90c1824eb4020ae3f5f43f3.1657643355.git.asml.silence@gmail.com [axboe: fold in 32-bit pointer cast warning fix] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 +++++- io_uring/net.c | 29 ++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0736e2773a5d..f1a9ff9b9ea7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -272,9 +272,13 @@ enum io_uring_op { * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if * the handler will continue to report * CQEs on behalf of the same SQE. + * + * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in + * the buf_index field. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) -#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECVSEND_FIXED_BUF (1U << 2) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index 61414d865cd7..ab443c52dcfd 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -15,6 +15,7 @@ #include "alloc_cache.h" #include "net.h" #include "notif.h" +#include "rsrc.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -849,13 +850,23 @@ out_free: int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~IORING_RECVSEND_POLL_FIRST) + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)) return -EINVAL; + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + unsigned idx = READ_ONCE(sqe->buf_index); + + if (unlikely(idx >= ctx->nr_user_bufs)) + return -EFAULT; + idx = array_index_nospec(idx, ctx->nr_user_bufs); + req->imu = READ_ONCE(ctx->user_bufs[idx]); + io_req_set_rsrc_node(req, ctx, 0); + } zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); @@ -909,10 +920,18 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - mm_account_pinned_pages(&notif->uarg.mmp, zc->len); + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, + (u64)(uintptr_t)zc->buf, zc->len); + if (unlikely(ret)) + return ret; + } else { + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, + &msg.msg_iter); + if (unlikely(ret)) + return ret; + mm_account_pinned_pages(&notif->uarg.mmp, zc->len); + } if (zc->addr) { ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); -- cgit v1.2.3 From 63809137ebb58f0aa2ce359117422686e3304f45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:47 +0100 Subject: io_uring: flush notifiers after sendzc Allow flushing notifiers as part of a sendzc request by setting the IORING_RECVSEND_NOTIF_FLUSH flag. When the sendzc request succeeds, it will flush the used [active] notifier.
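A hedged sketch of the surrounding userspace flow, tying the notification-slot registration from the earlier patch to the new flush flag (raw syscall wrapper assumed, error handling elided):

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_one_notif_slot(int ring_fd, __u64 tag)
{
	struct io_uring_notification_slot slot;
	struct io_uring_notification_register reg;

	memset(&slot, 0, sizeof(slot));
	memset(&reg, 0, sizeof(reg));
	slot.tag = tag;			/* reported back in the notification CQE */
	reg.nr_slots = 1;
	reg.data = (unsigned long)&slot;
	/* for this opcode, nr_args carries the size of 'reg' */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_NOTIFIERS, &reg, sizeof(reg));
}

A send prepared as in the earlier sketches would then OR IORING_RECVSEND_NOTIF_FLUSH into sqe->ioprio, so the slot's active notifier is flushed as soon as the send succeeds.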
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e0b4d9a6797e2fd6092824fe42953db7a519bbc8.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ io_uring/io_uring.c | 11 +---------- io_uring/io_uring.h | 10 ++++++++++ io_uring/net.c | 5 ++++- io_uring/notif.c | 2 +- io_uring/notif.h | 11 +++++++++++ 6 files changed, 31 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1a9ff9b9ea7..45272eb37d10 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -275,10 +275,14 @@ enum io_uring_op { * * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in * the buf_index field. + * + * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful + * send. Only for zerocopy sends. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) +#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cae11374456e..1d600a63643b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -621,7 +621,7 @@ void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } -static void io_task_refs_refill(struct io_uring_task *tctx) +void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; @@ -630,15 +630,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static inline void io_get_task_refs(int nr) -{ - struct io_uring_task *tctx = current->io_uring; - - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) - io_task_refs_refill(tctx); -} - static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 66bfd880d07f..cc81a9d1fd4d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -74,6 +74,7 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); void __io_put_task(struct task_struct *task, int nr); +void io_task_refs_refill(struct io_uring_task *tctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -270,4 +271,13 @@ static inline void io_put_task(struct task_struct *task, int nr) __io_put_task(task, nr); } +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + #endif diff --git a/io_uring/net.c b/io_uring/net.c index ab443c52dcfd..9ac2ce37c522 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -856,7 +856,8 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)) + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | + IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH)) return -EINVAL; if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); @@ -958,6 +959,8 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) return ret == -ERESTARTSYS ?
-EINTR : ret; } + if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) + io_notif_slot_flush_submit(notif_slot, 0); io_req_set_res(req, ret, 0); return IOU_OK; } diff --git a/io_uring/notif.c b/io_uring/notif.c index c5179e5c1cd6..a93887451bbb 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -133,7 +133,7 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, return notif; } -static void io_notif_slot_flush(struct io_notif_slot *slot) +void io_notif_slot_flush(struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { struct io_notif *notif = slot->notif; diff --git a/io_uring/notif.h b/io_uring/notif.h index 00efe164bdc4..6cd73d7b965b 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -54,6 +54,7 @@ int io_notif_register(struct io_ring_ctx *ctx, int io_notif_unregister(struct io_ring_ctx *ctx); void io_notif_cache_purge(struct io_ring_ctx *ctx); +void io_notif_slot_flush(struct io_notif_slot *slot); struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); @@ -74,3 +75,13 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, idx = array_index_nospec(idx, ctx->nr_notif_slots); return &ctx->notif_slots[idx]; } + +static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, + unsigned int issue_flags) +{ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + slot->notif->task = current; + io_get_task_refs(1); + } + io_notif_slot_flush(slot); +} -- cgit v1.2.3 From 4379d5f15b3fd4224c37841029178aa8082a242e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:48 +0100 Subject: io_uring: rename IORING_OP_FILES_UPDATE IORING_OP_FILES_UPDATE will become a more generic opcode serving different resource types; rename it to IORING_OP_RSRC_UPDATE and add subtype handling.
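For illustration, a files update expressed through the renamed opcode might look as follows; a hedged sketch assuming raw SQE preparation, with the subtype set explicitly even though IORING_RSRC_UPDATE_FILES is zero and existing IORING_OP_FILES_UPDATE users keep working unchanged:

#include <string.h>
#include <linux/io_uring.h>

static void prep_rsrc_update_files(struct io_uring_sqe *sqe, const int *fds,
				   unsigned int nr, unsigned int slot)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RSRC_UPDATE;
	sqe->fd = -1;
	sqe->addr = (unsigned long)fds;		/* fds to install */
	sqe->len = nr;				/* number of entries */
	sqe->off = slot;			/* first fixed-file slot to update */
	sqe->ioprio = IORING_RSRC_UPDATE_FILES;	/* resource subtype */
}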
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a907133907d9af3415a8a7aa1802c6aa97c03c6.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 +++++++++++- io_uring/opdef.c | 9 +++++---- io_uring/rsrc.c | 17 +++++++++++++++-- io_uring/rsrc.h | 4 ++-- 4 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 45272eb37d10..210a00ab6301 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -174,7 +174,8 @@ enum io_uring_op { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, - IORING_OP_FILES_UPDATE, + IORING_OP_RSRC_UPDATE, + IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE, IORING_OP_STATX, IORING_OP_READ, IORING_OP_WRITE, @@ -223,6 +224,7 @@ enum io_uring_op { #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) + /* * sqe->splice_flags * extends splice(2) flags @@ -289,6 +291,14 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) + +/* + * IORING_OP_RSRC_UPDATE flags + */ +enum { + IORING_RSRC_UPDATE_FILES, +}; + /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 7ab19bbf3126..72dd2b2d8a9d 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -246,12 +246,13 @@ const struct io_op_def io_op_defs[] = { .prep = io_close_prep, .issue = io_close, }, - [IORING_OP_FILES_UPDATE] = { + [IORING_OP_RSRC_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", - .prep = io_files_update_prep, - .issue = io_files_update, + .name = "RSRC_UPDATE", + .prep = io_rsrc_update_prep, + .issue = io_rsrc_update, + .ioprio = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7f66b0e25674..fc2b337e6c25 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -21,6 +21,7 @@ struct io_rsrc_update { u64 arg; u32 nr_args; u32 offset; + int type; }; static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, @@ -657,7 +658,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); @@ -671,6 +672,7 @@ int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); + up->type = READ_ONCE(sqe->ioprio); return 0; } @@ -713,7 +715,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, return ret; } -int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; @@ -742,6 +744,17 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + + switch (up->type) { + case IORING_RSRC_UPDATE_FILES: + return io_files_update(req, issue_flags); + } + return -EINVAL; +} + int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, 
void *rsrc) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index af342fd239d0..21813a23215f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -167,6 +167,6 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } -int io_files_update(struct io_kiocb *req, unsigned int issue_flags); -int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags); +int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); #endif -- cgit v1.2.3 From 492dddb4f6e3a5839c27d41ff1fecdbe6c3ab851 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:49 +0100 Subject: io_uring: add zc notification flush requests Overlay notification control onto IORING_OP_RSRC_UPDATE (former IORING_OP_FILES_UPDATE). It allows flushing a range of zc notifications from slots with indexes [sqe->off, sqe->off+sqe->len). If sqe->arg is not zero, it also copies sqe->arg as a new tag for all flushed notifications. Note, it doesn't flush a notification of a slot if there were no requests attached to it (since last flush or registration). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df13e2363400682a73dd9e71c3b990b8d1ff0333.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/rsrc.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 210a00ab6301..1463cfecb56b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -297,6 +297,7 @@ enum io_uring_op { */ enum { IORING_RSRC_UPDATE_FILES, + IORING_RSRC_UPDATE_NOTIF, }; /* diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fc2b337e6c25..9165fdf64269 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -15,6 +15,7 @@ #include "io_uring.h" #include "openclose.h" #include "rsrc.h" +#include "notif.h" struct io_rsrc_update { struct file *file; @@ -744,6 +745,41 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned len = up->nr_args; + unsigned idx_end, idx = up->offset; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (unlikely(check_add_overflow(idx, len, &idx_end))) { + ret = -EOVERFLOW; + goto out; + } + if (unlikely(idx_end > ctx->nr_notif_slots)) { + ret = -EINVAL; + goto out; + } + + for (; idx < idx_end; idx++) { + struct io_notif_slot *slot = &ctx->notif_slots[idx]; + + if (!slot->notif) + continue; + if (up->arg) + slot->tag = up->arg; + io_notif_slot_flush_submit(slot, issue_flags); + } +out: + io_ring_submit_unlock(ctx, issue_flags); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); @@ -751,6 +787,8 @@ int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) switch (up->type) { case IORING_RSRC_UPDATE_FILES: return io_files_update(req, issue_flags); + case IORING_RSRC_UPDATE_NOTIF: + return io_notif_update(req, issue_flags); } return -EINVAL; } -- cgit v1.2.3 From b4023554b1fb49f73a09e5f346a5facbf27d7383 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25
Jul 2022 09:58:35 +0200 Subject: USB: cdc: add control-signal defines Add defines for the Control Signal Bitmap Values from section 6.2.14 SetControlLineState of the CDC specification version 1.1. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20220725075841.1187-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/cdc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h index 6d61550959ef..372c81425cae 100644 --- a/include/uapi/linux/usb/cdc.h +++ b/include/uapi/linux/usb/cdc.h @@ -271,6 +271,10 @@ struct usb_cdc_line_coding { __u8 bDataBits; } __attribute__ ((packed)); +/* Control Signal Bitmap Values from 6.2.14 SetControlLineState */ +#define USB_CDC_CTRL_DTR (1 << 0) +#define USB_CDC_CTRL_RTS (1 << 1) + /* table 62; bits in multicast filter */ #define USB_CDC_PACKET_TYPE_PROMISCUOUS (1 << 0) #define USB_CDC_PACKET_TYPE_ALL_MULTICAST (1 << 1) /* no filter */ -- cgit v1.2.3 From a0a3202b44a9fdf2a1f6330a0d176aee76c8631d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Jul 2022 09:58:36 +0200 Subject: USB: cdc: add serial-state defines Add defines for the serial-state bitmap values from section 6.3.5 SerialState of the CDC specification version 1.1. Note that the bTxCarrier and bRxCarrier bits have been named after their RS-232 signal equivalents DSR and DCD. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20220725075841.1187-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/cdc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h index 372c81425cae..78caa9bdc4ae 100644 --- a/include/uapi/linux/usb/cdc.h +++ b/include/uapi/linux/usb/cdc.h @@ -306,6 +306,15 @@ struct usb_cdc_notification { __le16 wLength; } __attribute__ ((packed)); +/* UART State Bitmap Values from 6.3.5 SerialState */ +#define USB_CDC_SERIAL_STATE_DCD (1 << 0) +#define USB_CDC_SERIAL_STATE_DSR (1 << 1) +#define USB_CDC_SERIAL_STATE_BREAK (1 << 2) +#define USB_CDC_SERIAL_STATE_RING_SIGNAL (1 << 3) +#define USB_CDC_SERIAL_STATE_FRAMING (1 << 4) +#define USB_CDC_SERIAL_STATE_PARITY (1 << 5) +#define USB_CDC_SERIAL_STATE_OVERRUN (1 << 6) + struct usb_cdc_speed_change { __le32 DLBitRRate; /* contains the downlink bit rate (IN pipe) */ __le32 ULBitRate; /* contains the uplink bit rate (OUT pipe) */ -- cgit v1.2.3 From b7c14f23fb604fc66edae7514ed9b4b93930b5ba Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:39 -0700 Subject: btrfs: send: add stream v2 definitions This adds the definitions of the new commands for send stream version 2 and their respective attributes: fallocate, FS_IOC_SETFLAGS (a.k.a. chattr), and encoded writes. It also documents two changes to the send stream format in v2: the receiver shouldn't assume a maximum command size, and the DATA attribute is encoded differently to allow for writes larger than 64k. These will be implemented in subsequent changes, and then the ioctl will accept the new version and flag. 
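To make the DATA encoding change concrete, here is a hedged receiver-side sketch. The on-wire header layouts are restated from fs/btrfs/send.h purely as assumptions for illustration; this patch does not redefine them.

#include <stddef.h>
#include <stdint.h>

struct cmd_header {		/* assumed: __le32 len, __le16 cmd, __le32 crc */
	uint32_t len;		/* payload length, not counting this header */
	uint16_t cmd;
	uint32_t crc;
} __attribute__((packed));

struct tlv_header {		/* assumed: __le16 tlv_type, __le16 tlv_len */
	uint16_t tlv_type;
	uint16_t tlv_len;
} __attribute__((packed));

#define A_DATA 19		/* BTRFS_SEND_A_DATA */

/*
 * Size of an attribute's payload. 'remaining' counts from the start of
 * this attribute to the end of the enclosing command.
 */
static size_t v2_attr_payload_len(const struct tlv_header *tlv, size_t remaining)
{
	if (tlv->tlv_type == A_DATA)	/* v2: header is only the 2-byte type */
		return remaining - sizeof(tlv->tlv_type);
	return tlv->tlv_len;
}

The implicit length is what lifts the 16-bit tlv_len cap off DATA payloads, which is what permits the larger-than-64k writes mentioned above.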
Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- fs/btrfs/send.h | 40 ++++++++++++++++++++++++++++++++++++---- include/uapi/linux/btrfs.h | 7 +++++++ 3 files changed, 44 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 237753860758..6ec31736c522 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7552,7 +7552,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - sctx->send_max_size = BTRFS_SEND_BUF_SIZE; + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { ret = -ENOMEM; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index c47a2984aa5b..858ce8132614 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -12,7 +12,11 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 1 -#define BTRFS_SEND_BUF_SIZE SZ_64K +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no limit + * should be assumed. + */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K enum btrfs_tlv_type { BTRFS_TLV_U8, @@ -80,16 +84,20 @@ enum btrfs_send_cmd { BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - BTRFS_SEND_C_MAX_V2 = 22, + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_SETFLAGS = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, /* End */ - BTRFS_SEND_C_MAX = 22, + BTRFS_SEND_C_MAX = 25, }; /* attributes in send stream */ enum { BTRFS_SEND_A_UNSPEC = 0, + /* Version 1 */ BTRFS_SEND_A_UUID = 1, BTRFS_SEND_A_CTRANSID = 2, @@ -112,6 +120,11 @@ enum { BTRFS_SEND_A_PATH_LINK = 17, BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. + */ BTRFS_SEND_A_DATA = 19, BTRFS_SEND_A_CLONE_UUID = 20, @@ -120,7 +133,26 @@ enum { BTRFS_SEND_A_CLONE_OFFSET = 23, BTRFS_SEND_A_CLONE_LEN = 24, - BTRFS_SEND_A_MAX = 24, + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + BTRFS_SEND_A_SETFLAGS_FLAGS = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* End */ + BTRFS_SEND_A_MAX = 31, }; #ifdef __KERNEL__ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index d956b2993970..b6f26a434b10 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -777,6 +777,13 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_VERSION 0x8 +/* + * Send compressed data using the ENCODED_WRITE command instead of decompressing + * the data and sending it with the WRITE command. This requires protocol + * version >= 2. 
+ */ +#define BTRFS_SEND_FLAG_COMPRESSED 0x10 + #define BTRFS_SEND_FLAG_MASK \ (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ -- cgit v1.2.3 From d6815592806f481244d0e3435ca1f5383d90a14c Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:43 -0700 Subject: btrfs: send: enable support for stream v2 and compressed writes Now that the new support is implemented, allow the ioctl to accept v2 and the compressed flag, and update the version in sysfs. Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 7 +++++-- fs/btrfs/send.h | 2 +- include/uapi/linux/btrfs.h | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bc00393c1233..6d01dc26d408 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -701,8 +701,7 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - + hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } @@ -7755,6 +7754,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } else { sctx->proto = 1; } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 858ce8132614..b0dc07567d09 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -10,7 +10,7 @@ #include "ctree.h" #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION 2 /* * In send stream v1, no command is larger than 64K. In send stream v2, no limit diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b6f26a434b10..f54dc91e4025 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -788,7 +788,8 @@ struct btrfs_ioctl_received_subvol_args { (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ BTRFS_SEND_FLAG_OMIT_END_CMD | \ - BTRFS_SEND_FLAG_VERSION) + BTRFS_SEND_FLAG_VERSION | \ + BTRFS_SEND_FLAG_COMPRESSED) struct btrfs_ioctl_send_args { __s64 send_fd; /* in */ -- cgit v1.2.3 From 5008750eff5d4af8a3aed4a7567c4cfb2b3cb156 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Mon, 18 Jul 2022 14:18:11 +0200 Subject: net/sched: flower: Add PPPoE filter Add support for PPPoE specific fields for tc-flower. Those fields can be provided only when protocol was set to ETH_P_PPP_SES. Defines, dump, load and set are being done here. Overwrite basic.n_proto only in case of PPP_IP and PPP_IPV6, otherwise leave it as ETH_P_PPP_SES. 
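For context, flower's dissector keys follow the usual key/mask convention: a packet matches a rule when it agrees with the key on every bit set in the mask, which is why the tpid-like pppoe.type and the derived n_proto are keyed with all-ones masks. A hypothetical helper making that explicit for the new PPPoE fields (not part of the patch):

/* Illustrative only: a rule matches when (pkt & mask) == (key & mask)
 * for each field; an all-zero mask therefore means "don't care".
 */
static bool pppoe_key_matches(__be16 pkt_sid, __be16 pkt_proto,
			      const struct flow_dissector_key_pppoe *key,
			      const struct flow_dissector_key_pppoe *mask)
{
	if ((pkt_sid & mask->session_id) !=
	    (key->session_id & mask->session_id))
		return false;
	return (pkt_proto & mask->ppp_proto) ==
	       (key->ppp_proto & mask->ppp_proto);
}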
Signed-off-by: Wojciech Drewek Acked-by: Guillaume Nault Signed-off-by: Tony Nguyen --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 64 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9a2ee1e39fad..c142c0f8ed8a 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -589,6 +589,9 @@ enum { TCA_FLOWER_KEY_NUM_OF_VLANS, /* u8 */ + TCA_FLOWER_KEY_PPPOE_SID, /* be16 */ + TCA_FLOWER_KEY_PPP_PROTO, /* be16 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a1e34480b7e..041d63ff809a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ struct fl_flow_key { struct flow_dissector_key_ct ct; struct flow_dissector_key_hash hash; struct flow_dissector_key_num_of_vlans num_of_vlans; + struct flow_dissector_key_pppoe pppoe; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -708,6 +710,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_NUM_OF_VLANS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, }; @@ -1035,6 +1039,50 @@ static void fl_set_key_vlan(struct nlattr **tb, } } +static void fl_set_key_pppoe(struct nlattr **tb, + struct flow_dissector_key_pppoe *key_val, + struct flow_dissector_key_pppoe *key_mask, + struct fl_flow_key *key, + struct fl_flow_key *mask) +{ + /* key_val::type must be set to ETH_P_PPP_SES + * because ETH_P_PPP_SES was stored in basic.n_proto + * which might get overwritten by ppp_proto + * or might be set to 0; the role of key_val::type + * is similar to vlan_key::tpid + */ + key_val->type = htons(ETH_P_PPP_SES); + key_mask->type = cpu_to_be16(~0); + + if (tb[TCA_FLOWER_KEY_PPPOE_SID]) { + key_val->session_id = + nla_get_be16(tb[TCA_FLOWER_KEY_PPPOE_SID]); + key_mask->session_id = cpu_to_be16(~0); + } + if (tb[TCA_FLOWER_KEY_PPP_PROTO]) { + key_val->ppp_proto = + nla_get_be16(tb[TCA_FLOWER_KEY_PPP_PROTO]); + key_mask->ppp_proto = cpu_to_be16(~0); + + if (key_val->ppp_proto == htons(PPP_IP)) { + key->basic.n_proto = htons(ETH_P_IP); + mask->basic.n_proto = cpu_to_be16(~0); + } else if (key_val->ppp_proto == htons(PPP_IPV6)) { + key->basic.n_proto = htons(ETH_P_IPV6); + mask->basic.n_proto = cpu_to_be16(~0); + } else if (key_val->ppp_proto == htons(PPP_MPLS_UC)) { + key->basic.n_proto = htons(ETH_P_MPLS_UC); + mask->basic.n_proto = cpu_to_be16(~0); + } else if (key_val->ppp_proto == htons(PPP_MPLS_MC)) { + key->basic.n_proto = htons(ETH_P_MPLS_MC); + mask->basic.n_proto = cpu_to_be16(~0); + } + } else { + key->basic.n_proto = 0; + mask->basic.n_proto = cpu_to_be16(0); + } +} + static void fl_set_key_flag(u32 flower_key, u32 flower_mask, u32 *dissector_key, u32 *dissector_mask, u32 flower_flag_bit, u32 dissector_flag_bit) @@ -1645,6 +1693,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb, } } + if (key->basic.n_proto == htons(ETH_P_PPP_SES)) + fl_set_key_pppoe(tb, &key->pppoe, &mask->pppoe, key, mask); + if (key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) { fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, @@ -1917,6 +1968,8 @@
static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_HASH, hash); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, num_of_vlans); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PPPOE, pppoe); skb_flow_dissector_init(dissector, keys, cnt); } @@ -3045,6 +3098,17 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, fl_dump_key_ip(skb, false, &key->ip, &mask->ip))) goto nla_put_failure; + if (mask->pppoe.session_id) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_PPPOE_SID, + key->pppoe.session_id)) + goto nla_put_failure; + } + if (mask->basic.n_proto && mask->pppoe.ppp_proto) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_PPP_PROTO, + key->pppoe.ppp_proto)) + goto nla_put_failure; + } + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, -- cgit v1.2.3 From 7b2d9a1a50ec3bedf067fe234a4a71196c89e826 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Jul 2022 10:29:16 +0200 Subject: net: devlink: introduce nested devlink entity for line card For the purpose of exposing device info and allowing flash updates, which are going to be implemented in follow-up patches, introduce the possibility for a line card to expose its relation to a nested devlink entity. The nested devlink entity represents the line card. Example: $ devlink lc show pci/0000:01:00.0 lc 1 pci/0000:01:00.0: lc 1 state active type 16x100G nested_devlink auxiliary/mlxsw_core.lc.0 supported_types: 16x100G $ devlink dev show auxiliary/mlxsw_core.lc.0 auxiliary/mlxsw_core.lc.0 Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 42 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 780744b550b8..5bd3fac12e9e 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1580,6 +1580,8 @@ void devlink_linecard_provision_clear(struct devlink_linecard *linecard); void devlink_linecard_provision_fail(struct devlink_linecard *linecard); void devlink_linecard_activate(struct devlink_linecard *linecard); void devlink_linecard_deactivate(struct devlink_linecard *linecard); +void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink); int devl_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b3d40a5d72ff..541321695f52 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -576,6 +576,8 @@ enum devlink_attr { DEVLINK_ATTR_LINECARD_TYPE, /* string */ DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES, /* nested */ + DEVLINK_ATTR_NESTED_DEVLINK, /* nested */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 865232a1455f..698b2d6e0ec7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -89,6 +89,7 @@ struct devlink_linecard { const char *type; struct devlink_linecard_type *types; unsigned int types_count; + struct devlink *nested_devlink; }; /** @@ -856,6 +857,24 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +static int
devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) +{ + struct nlattr *nested_attr; + + nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK); + if (!nested_attr) + return -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + nla_nest_end(msg, nested_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nested_attr); + return -EMSGSIZE; +} + struct devlink_reload_combination { enum devlink_reload_action action; enum devlink_reload_limit limit; @@ -2135,6 +2154,10 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, nla_nest_end(msg, attr); } + if (linecard->nested_devlink && + devlink_nl_put_nested_handle(msg, linecard->nested_devlink)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -10255,6 +10278,7 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); @@ -10273,6 +10297,7 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); void devlink_linecard_provision_fail(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); @@ -10320,6 +10345,23 @@ void devlink_linecard_deactivate(struct devlink_linecard *linecard) } EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); +/** + * devlink_linecard_nested_dl_set - Attach/detach nested devlink + * instance to linecard. + * + * @linecard: devlink linecard + * @nested_devlink: devlink instance to attach or NULL to detach + */ +void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink) +{ + mutex_lock(&linecard->state_lock); + linecard->nested_devlink = nested_devlink; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); + int devl_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From a0c0c44e9aa2b5da876467083c359b368f3ce95e Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Wed, 20 Jul 2022 12:43:28 +0000 Subject: s390: add ELF note type for encrypted CPU state of a PV VCPU The type NT_S390_PV_CPU_DATA note contains the encrypted CPU state of a PV VCPU. It's only relevant in dumps of s390 PV VMs and can't be decrypted without a second block of encrypted data which provides key parts. Therefore we only reserve the note type here. The zgetdump tool from the s390-tools package can, together with a Customer Communication Key, be used to convert a PV VM dump into a normal VM dump. zgetdump will decrypt the CPU data and overwrite the other respective notes to make the data accessible for crash and other debugging tools. 
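For orientation, a dump tool walking a core file's PT_NOTE segment would identify the new note purely by its type (and, presumably, by the "LINUX" note name used for other NT_S390_* register notes); the payload itself stays opaque without the key material. A hypothetical user-space fragment, with the name check being an assumption:

/* Hypothetical sketch: recognize (and skip) the encrypted PV CPU
 * data note while iterating ELF notes in a dump.
 */
#include <elf.h>
#include <string.h>

#define NT_S390_PV_CPU_DATA	0x30e	/* mirrors the new uapi define */

static int is_pv_cpu_data(const Elf64_Nhdr *nhdr, const char *name)
{
	/* assumption: the note carries the "LINUX" name like other
	 * NT_S390_* register notes
	 */
	return nhdr->n_type == NT_S390_PV_CPU_DATA &&
	       strcmp(name, "LINUX") == 0;
}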
Signed-off-by: Janosch Frank Acked-by: Heiko Carstens [agordeev@linux.ibm.com changed description] Signed-off-by: Alexander Gordeev --- include/uapi/linux/elf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 2b9f5e9985e5..c7b056af9ef0 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -420,6 +420,7 @@ typedef struct elf64_shdr { #define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ #define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ #define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ #define NT_ARM_TLS 0x401 /* ARM TLS register */ #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ -- cgit v1.2.3 From 9dd1cd3220eca534f2d47afad7ce85f4c40118d8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Jul 2022 13:58:04 -0400 Subject: dm: fix dm-raid crash if md_handle_request() splits bio Commit ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") introduced the optimization to _not_ perform bio_associate_blkg()'s relatively costly work when DM core clones its bio. But in doing so it exposed the possibility for DM's cloned bio to alter DM target behavior (e.g. crash) if a target were to issue IO without first calling bio_set_dev(). The DM raid target can trigger an MD crash due to its need to split the DM bio that is passed to md_handle_request(). The split will recurse to submit_bio_noacct() using a bio with an uninitialized ->bi_blkg. This NULL bio->bi_blkg causes blk_throtl_bio() to dereference a NULL blkg_to_tg(bio->bi_blkg). Fix this in DM core by adding a new 'needs_bio_set_dev' target flag that will make alloc_tio() call bio_set_dev() on behalf of the target. dm-raid is the only target that requires this flag. bio_set_dev() initializes the DM cloned bio's ->bi_blkg, using bio_associate_blkg, before passing the bio to md_handle_request(). A long-term fix would be to audit and refactor MD code to rely on DM to split its bio, using dm_accept_partial_bio(), but there are MD raid personalities (e.g. raid1 and raid10) whose implementations are tightly coupled to handling the bio splitting inline.
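From a target's point of view, the contract is simple: a clone's ->bi_blkg is only valid once bio_set_dev() has run, either in the target's own map function or, with the new flag, in alloc_tio(). A minimal sketch of a map function that handles it itself (a hypothetical target with a single underlying device; example_map() and the use of ti->private are illustrative only):

#include <linux/device-mapper.h>

/* Hypothetical map function: set the device (which initializes
 * bio->bi_blkg via bio_associate_blkg) before remapping the sector.
 * A target that cannot do this before submitting IO must set
 * ti->needs_bio_set_dev in its ctr instead.
 */
static int example_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_dev *dev = ti->private;	/* set up in the ctr */

	bio_set_dev(bio, dev->bdev);
	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
	return DM_MAPIO_REMAPPED;
}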
Fixes: ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 1 + drivers/md/dm.c | 13 ++++++------- include/linux/device-mapper.h | 6 ++++++ include/uapi/linux/dm-ioctl.h | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e6a9b8cb22d3..3203aecd6961 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3095,6 +3095,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; ti->num_flush_bios = 1; + ti->needs_bio_set_dev = true; /* Restore any requested new layout for conversion decision */ rs_config_restore(rs, &rs_layout); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 47bcc5081b2b..f6a6437d0a7c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -574,9 +574,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct bio *clone; clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; - tio = clone_to_tio(clone); tio->flags = 0; dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); @@ -609,6 +606,7 @@ static void free_io(struct dm_io *io) static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { + struct mapped_device *md = ci->io->md; struct dm_target_io *tio; struct bio *clone; @@ -618,14 +616,10 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - struct mapped_device *md = ci->io->md; - clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->mempools->bs); if (!clone) return NULL; - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@ -641,6 +635,11 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, tio->len_ptr = len; tio->old_sector = 0; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; + if (unlikely(ti->needs_bio_set_dev)) + bio_set_dev(clone, md->disk->part0); + if (len) { clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(clone)) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 920085dd7f3b..04c6acf7faaa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -373,6 +373,12 @@ struct dm_target { * after returning DM_MAPIO_SUBMITTED from its map function. */ bool accounts_remapped_io:1; + + /* + * Set if the target will submit the DM bio without first calling + * bio_set_dev(). NOTE: ideally a target should _not_ need this. 
+ */ + bool needs_bio_set_dev:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2e9550fef90f..27ad9671f2df 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -286,9 +286,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 46 +#define DM_VERSION_MINOR 47 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2022-02-22)" +#define DM_VERSION_EXTRA "-ioctl (2022-07-28)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 08f588fa301bef264576fc915da6bf31b585a824 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 27 Jul 2022 22:27:20 +0530 Subject: devlink: introduce framework for selftests Add a framework for running selftests. Framework exposes devlink commands and test suite(s) to the user to execute and query the supported tests by the driver. Below are new entries in devlink_nl_ops devlink_nl_cmd_selftests_show_doit/dumpit: To query the supported selftests by the drivers. devlink_nl_cmd_selftests_run: To execute selftests. Users can provide a test mask for executing group tests or standalone tests. Documentation/networking/devlink/ path is already part of MAINTAINERS & the new files come under this path. Hence no update needed to the MAINTAINERS Signed-off-by: Vikas Gupta Reviewed-by: Andy Gospodarek Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-selftests.rst | 38 ++++ include/net/devlink.h | 21 ++ include/uapi/linux/devlink.h | 29 +++ net/core/devlink.c | 216 +++++++++++++++++++++ 4 files changed, 304 insertions(+) create mode 100644 Documentation/networking/devlink/devlink-selftests.rst (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-selftests.rst b/Documentation/networking/devlink/devlink-selftests.rst new file mode 100644 index 000000000000..c0aa1f3aef0d --- /dev/null +++ b/Documentation/networking/devlink/devlink-selftests.rst @@ -0,0 +1,38 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================= +Devlink Selftests +================= + +The ``devlink-selftests`` API allows executing selftests on the device. + +Tests Mask +========== +The ``devlink-selftests`` command should be run with a mask indicating +the tests to be executed. + +Tests Description +================= +The following is a list of tests that drivers may execute. + +.. list-table:: List of tests + :widths: 5 90 + + * - Name + - Description + * - ``DEVLINK_SELFTEST_FLASH`` + - Devices may have the firmware on non-volatile memory on the board, e.g. + flash. This particular test helps to run a flash selftest on the device. + Implementation of the test is left to the driver/firmware. + +example usage +------------- + +.. 
code:: shell + + # Query selftests supported on the devlink device + $ devlink dev selftests show DEV + # Query selftests supported on all devlink devices + $ devlink dev selftests show + # Executes selftests on the device + $ devlink dev selftests run DEV id flash diff --git a/include/net/devlink.h b/include/net/devlink.h index 5bd3fac12e9e..119ed1ffb988 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1509,6 +1509,27 @@ struct devlink_ops { struct devlink_rate *parent, void *priv_child, void *priv_parent, struct netlink_ext_ack *extack); + /** + * selftest_check() - queries if selftest is supported + * @devlink: devlink instance + * @id: test index + * @extack: extack for reporting error messages + * + * Return: true if test is supported by the driver + */ + bool (*selftest_check)(struct devlink *devlink, unsigned int id, + struct netlink_ext_ack *extack); + /** + * selftest_run() - Runs a selftest + * @devlink: devlink instance + * @id: test index + * @extack: extack for reporting error messages + * + * Return: status of the test + */ + enum devlink_selftest_status + (*selftest_run)(struct devlink *devlink, unsigned int id, + struct netlink_ext_ack *extack); }; void *devlink_priv(struct devlink *devlink); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 541321695f52..2f24b53a87a5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -136,6 +136,9 @@ enum devlink_command { DEVLINK_CMD_LINECARD_NEW, DEVLINK_CMD_LINECARD_DEL, + DEVLINK_CMD_SELFTESTS_GET, /* can dump */ + DEVLINK_CMD_SELFTESTS_RUN, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -276,6 +279,30 @@ enum { #define DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS \ (_BITUL(__DEVLINK_FLASH_OVERWRITE_MAX_BIT) - 1) +enum devlink_attr_selftest_id { + DEVLINK_ATTR_SELFTEST_ID_UNSPEC, + DEVLINK_ATTR_SELFTEST_ID_FLASH, /* flag */ + + __DEVLINK_ATTR_SELFTEST_ID_MAX, + DEVLINK_ATTR_SELFTEST_ID_MAX = __DEVLINK_ATTR_SELFTEST_ID_MAX - 1 +}; + +enum devlink_selftest_status { + DEVLINK_SELFTEST_STATUS_SKIP, + DEVLINK_SELFTEST_STATUS_PASS, + DEVLINK_SELFTEST_STATUS_FAIL +}; + +enum devlink_attr_selftest_result { + DEVLINK_ATTR_SELFTEST_RESULT_UNSPEC, + DEVLINK_ATTR_SELFTEST_RESULT, /* nested */ + DEVLINK_ATTR_SELFTEST_RESULT_ID, /* u32, enum devlink_attr_selftest_id */ + DEVLINK_ATTR_SELFTEST_RESULT_STATUS, /* u8, enum devlink_selftest_status */ + + __DEVLINK_ATTR_SELFTEST_RESULT_MAX, + DEVLINK_ATTR_SELFTEST_RESULT_MAX = __DEVLINK_ATTR_SELFTEST_RESULT_MAX - 1 +}; + /** * enum devlink_trap_action - Packet trap action.
* @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not @@ -578,6 +605,8 @@ enum devlink_attr { DEVLINK_ATTR_NESTED_DEVLINK, /* nested */ + DEVLINK_ATTR_SELFTESTS, /* nested */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index ca4c9939d569..efeba223b9b8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -201,6 +201,10 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), }; +static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { + [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, +}; + static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); #define DEVLINK_REGISTERED XA_MARK_1 @@ -4826,6 +4830,206 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, return ret; } +static int +devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, + u32 portid, u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *selftests; + void *hdr; + int err; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, + DEVLINK_CMD_SELFTESTS_GET); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto err_cancel_msg; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + if (devlink->ops->selftest_check(devlink, i, extack)) { + err = nla_put_flag(msg, i); + if (err) + goto err_cancel_msg; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (!devlink->ops->selftest_check) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, + info->snd_seq, 0, info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err = 0; + + mutex_lock(&devlink_mutex); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + if (idx < start || !devlink->ops->selftest_check) + goto inc; + + devl_lock(devlink); + err = devlink_nl_selftests_fill(msg, devlink, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + devl_unlock(devlink); + if (err) { + devlink_put(devlink); + break; + } +inc: + idx++; + devlink_put(devlink); + } + mutex_unlock(&devlink_mutex); + + if (err != -EMSGSIZE) + return err; + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, + enum devlink_selftest_status test_status) +{ + struct nlattr *result_attr; + + result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); + if (!result_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || + nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, + test_status)) + goto nla_put_failure; + + nla_nest_end(skb, 
result_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, result_attr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_selftests_run(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *attrs, *selftests; + struct sk_buff *msg; + void *hdr; + int err; + int i; + + if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_SELFTESTS]) { + NL_SET_ERR_MSG_MOD(info->extack, "selftest required"); + return -EINVAL; + } + + attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; + + err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, + devlink_selftest_nl_policy, info->extack); + if (err < 0) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = -EMSGSIZE; + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto genlmsg_cancel; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + enum devlink_selftest_status test_status; + + if (nla_get_flag(tb[i])) { + if (!devlink->ops->selftest_check(devlink, i, + info->extack)) { + if (devlink_selftest_result_put(msg, i, + DEVLINK_SELFTEST_STATUS_SKIP)) + goto selftests_nest_cancel; + continue; + } + + test_status = devlink->ops->selftest_run(devlink, i, + info->extack); + if (devlink_selftest_result_put(msg, i, test_status)) + goto selftests_nest_cancel; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +selftests_nest_cancel: + nla_nest_cancel(msg, selftests); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return err; +} + static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -8969,6 +9173,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -9328,6 +9533,17 @@ static const struct genl_small_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_trap_policer_set_doit, .flags = GENL_ADMIN_PERM, }, + { + .cmd = DEVLINK_CMD_SELFTESTS_GET, + .doit = devlink_nl_cmd_selftests_get_doit, + .dumpit = devlink_nl_cmd_selftests_get_dumpit + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_RUN, + .doit = devlink_nl_cmd_selftests_run, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From b07c8cdbe918aa17da864da9a89b22afaed0393e Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Wed, 27 Jul 2022 20:54:05 +0200 Subject: seg6: add support for SRv6 H.Encaps.Red behavior The SRv6 H.Encaps.Red behavior described in [1] is an optimization of the SRv6 H.Encaps behavior [2]. H.Encaps.Red reduces the length of the SRH by excluding the first segment (SID) in the SRH of the pushed IPv6 header. The first SID is only placed in the IPv6 Destination Address field of the pushed IPv6 header. 
When the SRv6 Policy only contains one SID the SRH is omitted, unless there is an HMAC TLV to be carried. [1] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.2 [2] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.1 Signed-off-by: Andrea Mayer Signed-off-by: Anton Makarov Signed-off-by: David S. Miller --- include/uapi/linux/seg6_iptunnel.h | 1 + net/ipv6/seg6_iptunnel.c | 128 ++++++++++++++++++++++++++++++++++++- 2 files changed, 128 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index eb815e0d0ac3..538152a7b2c3 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -35,6 +35,7 @@ enum { SEG6_IPTUN_MODE_INLINE, SEG6_IPTUN_MODE_ENCAP, SEG6_IPTUN_MODE_L2ENCAP, + SEG6_IPTUN_MODE_ENCAP_RED, }; #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index e756ba705fd9..454bd8a838e6 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -36,6 +36,7 @@ static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) case SEG6_IPTUN_MODE_INLINE: break; case SEG6_IPTUN_MODE_ENCAP: + case SEG6_IPTUN_MODE_ENCAP_RED: head = sizeof(struct ipv6hdr); break; case SEG6_IPTUN_MODE_L2ENCAP: @@ -197,6 +198,124 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) } EXPORT_SYMBOL_GPL(seg6_do_srh_encap); +/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ +static int seg6_do_srh_encap_red(struct sk_buff *skb, + struct ipv6_sr_hdr *osrh, int proto) +{ + __u8 first_seg = osrh->first_segment; + struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(dst->dev); + struct ipv6hdr *hdr, *inner_hdr; + int hdrlen = ipv6_optlen(osrh); + int red_tlv_offset, tlv_offset; + struct ipv6_sr_hdr *isrh; + bool skip_srh = false; + __be32 flowlabel; + int tot_len, err; + int red_hdrlen; + int tlvs_len; + + if (first_seg > 0) { + red_hdrlen = hdrlen - sizeof(struct in6_addr); + } else { + /* NOTE: if tag/flags and/or other TLVs are introduced in the + * seg6_iptunnel infrastructure, they should be considered when + * deciding to skip the SRH. + */ + skip_srh = !sr_has_hmac(osrh); + + red_hdrlen = skip_srh ? 0 : hdrlen; + } + + tot_len = red_hdrlen + sizeof(struct ipv6hdr); + + err = skb_cow_head(skb, tot_len + skb->mac_len); + if (unlikely(err)) + return err; + + inner_hdr = ipv6_hdr(skb); + flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); + + skb_push(skb, tot_len); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + hdr = ipv6_hdr(skb); + + /* based on seg6_do_srh_encap() */ + if (skb->protocol == htons(ETH_P_IPV6)) { + ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), + flowlabel); + hdr->hop_limit = inner_hdr->hop_limit; + } else { + ip6_flow_hdr(hdr, 0, flowlabel); + hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); + + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + IP6CB(skb)->iif = skb->skb_iif; + } + + /* no matter if we have to skip the SRH or not, the first segment + * always comes in the pushed IPv6 header. 
+ */ + hdr->daddr = osrh->segments[first_seg]; + + if (skip_srh) { + hdr->nexthdr = proto; + + set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + goto out; + } + + /* we cannot skip the SRH, slow path */ + + hdr->nexthdr = NEXTHDR_ROUTING; + isrh = (void *)hdr + sizeof(struct ipv6hdr); + + if (unlikely(!first_seg)) { + /* this is a very rare case; we have only one SID but + * we cannot skip the SRH since we are carrying some + * other info. + */ + memcpy(isrh, osrh, hdrlen); + goto srcaddr; + } + + tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr); + red_tlv_offset = tlv_offset - sizeof(struct in6_addr); + + memcpy(isrh, osrh, red_tlv_offset); + + tlvs_len = hdrlen - tlv_offset; + if (unlikely(tlvs_len > 0)) { + const void *s = (const void *)osrh + tlv_offset; + void *d = (void *)isrh + red_tlv_offset; + + memcpy(d, s, tlvs_len); + } + + --isrh->first_segment; + isrh->hdrlen -= 2; + +srcaddr: + isrh->nexthdr = proto; + set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + +#ifdef CONFIG_IPV6_SEG6_HMAC + if (unlikely(!skip_srh && sr_has_hmac(isrh))) { + err = seg6_push_hmac(net, &hdr->saddr, isrh); + if (unlikely(err)) + return err; + } +#endif + +out: + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + skb_postpush_rcsum(skb, hdr, tot_len); + + return 0; +} + /* insert an SRH within an IPv6 packet, just after the IPv6 header */ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { @@ -269,6 +388,7 @@ static int seg6_do_srh(struct sk_buff *skb) return err; break; case SEG6_IPTUN_MODE_ENCAP: + case SEG6_IPTUN_MODE_ENCAP_RED: err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); if (err) return err; @@ -280,7 +400,11 @@ static int seg6_do_srh(struct sk_buff *skb) else return -EINVAL; - err = seg6_do_srh_encap(skb, tinfo->srh, proto); + if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) + err = seg6_do_srh_encap(skb, tinfo->srh, proto); + else + err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); + if (err) return err; @@ -517,6 +641,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, break; case SEG6_IPTUN_MODE_L2ENCAP: break; + case SEG6_IPTUN_MODE_ENCAP_RED: + break; default: return -EINVAL; } -- cgit v1.2.3 From 13f0296be8ece1189cbc4383a45ba97cafaecc09 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Wed, 27 Jul 2022 20:54:06 +0200 Subject: seg6: add support for SRv6 H.L2Encaps.Red behavior The SRv6 H.L2Encaps.Red behavior described in [1] is an optimization of the SRv6 H.L2Encaps behavior [2]. H.L2Encaps.Red reduces the length of the SRH by excluding the first segment (SID) in the SRH of the pushed IPv6 header. The first SID is only placed in the IPv6 Destination Address field of the pushed IPv6 header. When the SRv6 Policy only contains one SID the SRH is omitted, unless there is an HMAC TLV to be carried. [1] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.4 [2] - https://datatracker.ietf.org/doc/html/rfc8986#section-5.3 Signed-off-by: Andrea Mayer Signed-off-by: Anton Makarov Signed-off-by: David S. 
Miller --- include/uapi/linux/seg6_iptunnel.h | 1 + net/ipv6/seg6_iptunnel.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index 538152a7b2c3..a9fa777f16de 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -36,6 +36,7 @@ enum { SEG6_IPTUN_MODE_ENCAP, SEG6_IPTUN_MODE_L2ENCAP, SEG6_IPTUN_MODE_ENCAP_RED, + SEG6_IPTUN_MODE_L2ENCAP_RED, }; #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 454bd8a838e6..34db881204d2 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -40,6 +40,7 @@ static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) head = sizeof(struct ipv6hdr); break; case SEG6_IPTUN_MODE_L2ENCAP: + case SEG6_IPTUN_MODE_L2ENCAP_RED: return 0; } @@ -413,6 +414,7 @@ static int seg6_do_srh(struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); break; case SEG6_IPTUN_MODE_L2ENCAP: + case SEG6_IPTUN_MODE_L2ENCAP_RED: if (!skb_mac_header_was_set(skb)) return -EINVAL; @@ -422,7 +424,13 @@ static int seg6_do_srh(struct sk_buff *skb) skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); - err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET); + if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) + err = seg6_do_srh_encap(skb, tinfo->srh, + IPPROTO_ETHERNET); + else + err = seg6_do_srh_encap_red(skb, tinfo->srh, + IPPROTO_ETHERNET); + if (err) return err; @@ -643,6 +651,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, break; case SEG6_IPTUN_MODE_ENCAP_RED: break; + case SEG6_IPTUN_MODE_L2ENCAP_RED: + break; default: return -EINVAL; } -- cgit v1.2.3 From 8a061562e2f2b32bfb5bff5bf3afc64e37d95a27 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 29 Jul 2022 17:14:53 +0530 Subject: RISC-V: KVM: Add extensible CSR emulation framework We add an extensible CSR emulation framework which is based upon the existing system instruction emulation. This will be useful for upcoming AIA, PMU, nested virtualization, and other features. The CSR emulation framework also has a provision to emulate CSRs in user space, but this will be used only in very specific cases, such as AIA IMSIC CSR emulation or vendor-specific CSR emulation in user space. By default, all CSRs not handled by KVM RISC-V will be redirected back to the guest VCPU as an illegal instruction trap.
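Since csr_funcs[] is left empty by this patch, an example entry may help: an in-kernel handler covers a contiguous CSR range and returns one of the kvm_insn_return codes. The sketch below is hypothetical (a read-only CSR emulated as zero), not code from this series:

/* Hypothetical csr_funcs[] entry: emulate one read-only CSR
 * in-kernel; writes are rejected as an illegal instruction.
 */
static int example_csr_rmw(struct kvm_vcpu *vcpu, unsigned int csr_num,
			   unsigned long *val, unsigned long new_val,
			   unsigned long wr_mask)
{
	if (wr_mask)
		return KVM_INSN_ILLEGAL_TRAP;	/* read-only */
	*val = 0;				/* emulated value */
	return KVM_INSN_CONTINUE_NEXT_SEPC;
}

static const struct csr_func csr_funcs[] = {
	{ .base = 0xC00, .count = 1, .func = example_csr_rmw },
};

Any CSR number not claimed by such an entry either exits to user space or is reflected back to the guest as an illegal instruction trap, as described above.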
Signed-off-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 5 + arch/riscv/include/asm/kvm_vcpu_insn.h | 6 ++ arch/riscv/kvm/vcpu.c | 34 ++++--- arch/riscv/kvm/vcpu_insn.c | 172 ++++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 8 ++ 5 files changed, 209 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index d482d771eba4..59a0cf2ca7b9 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -65,6 +65,8 @@ struct kvm_vcpu_stat { u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; + u64 csr_exit_user; + u64 csr_exit_kernel; u64 exits; }; @@ -210,6 +212,9 @@ struct kvm_vcpu_arch { /* MMIO instruction details */ struct kvm_mmio_decode mmio_decode; + /* CSR instruction details */ + struct kvm_csr_decode csr_decode; + /* SBI context */ struct kvm_sbi_context sbi_context; diff --git a/arch/riscv/include/asm/kvm_vcpu_insn.h b/arch/riscv/include/asm/kvm_vcpu_insn.h index 3351eb61a251..350011c83581 100644 --- a/arch/riscv/include/asm/kvm_vcpu_insn.h +++ b/arch/riscv/include/asm/kvm_vcpu_insn.h @@ -18,6 +18,11 @@ struct kvm_mmio_decode { int return_handled; }; +struct kvm_csr_decode { + unsigned long insn; + int return_handled; +}; + /* Return values used by function emulating a particular instruction */ enum kvm_insn_return { KVM_INSN_EXIT_TO_USER_SPACE = 0, @@ -28,6 +33,7 @@ enum kvm_insn_return { }; void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap); diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index a3c051cb070c..3c95924d38c7 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -26,6 +26,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, csr_exit_user), + STATS_DESC_COUNTER(VCPU, csr_exit_kernel), STATS_DESC_COUNTER(VCPU, exits) }; @@ -899,22 +901,26 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); - /* Process MMIO value returned from user-space */ - if (run->exit_reason == KVM_EXIT_MMIO) { + switch (run->exit_reason) { + case KVM_EXIT_MMIO: + /* Process MMIO value returned from user-space */ ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run); - if (ret) { - kvm_vcpu_srcu_read_unlock(vcpu); - return ret; - } - } - - /* Process SBI value returned from user-space */ - if (run->exit_reason == KVM_EXIT_RISCV_SBI) { + break; + case KVM_EXIT_RISCV_SBI: + /* Process SBI value returned from user-space */ ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run); - if (ret) { - kvm_vcpu_srcu_read_unlock(vcpu); - return ret; - } + break; + case KVM_EXIT_RISCV_CSR: + /* Process CSR value returned from user-space */ + ret = kvm_riscv_vcpu_csr_return(vcpu, vcpu->run); + break; + default: + ret = 0; + break; + } + if (ret) { + kvm_vcpu_srcu_read_unlock(vcpu); + return ret; } if (run->immediate_exit) { diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 75ca62a7fba5..7eb90a47b571 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -14,6 +14,19 @@ #define INSN_MASK_WFI 0xffffffff #define INSN_MATCH_WFI 0x10500073 +#define INSN_MATCH_CSRRW 0x1073 +#define INSN_MASK_CSRRW 0x707f +#define 
INSN_MATCH_CSRRS 0x2073 +#define INSN_MASK_CSRRS 0x707f +#define INSN_MATCH_CSRRC 0x3073 +#define INSN_MASK_CSRRC 0x707f +#define INSN_MATCH_CSRRWI 0x5073 +#define INSN_MASK_CSRRWI 0x707f +#define INSN_MATCH_CSRRSI 0x6073 +#define INSN_MASK_CSRRSI 0x707f +#define INSN_MATCH_CSRRCI 0x7073 +#define INSN_MASK_CSRRCI 0x707f + #define INSN_MATCH_LB 0x3 #define INSN_MASK_LB 0x707f #define INSN_MATCH_LH 0x1003 @@ -71,6 +84,7 @@ #define SH_RS1 15 #define SH_RS2 20 #define SH_RS2C 2 +#define MASK_RX 0x1f #define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) #define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ @@ -104,7 +118,7 @@ #define REG_PTR(insn, pos, regs) \ ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) -#define GET_RM(insn) (((insn) >> 12) & 7) +#define GET_FUNCT3(insn) (((insn) >> 12) & 7) #define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) #define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) @@ -116,7 +130,6 @@ #define IMM_I(insn) ((s32)(insn) >> 20) #define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ (s32)(((insn) >> 7) & 0x1f)) -#define MASK_FUNCT3 0x7000 struct insn_func { unsigned long mask; @@ -189,7 +202,162 @@ static int wfi_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn) return KVM_INSN_CONTINUE_NEXT_SEPC; } +struct csr_func { + unsigned int base; + unsigned int count; + /* + * Possible return values are the same as "func" callback in + * "struct insn_func". + */ + int (*func)(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask); +}; + +static const struct csr_func csr_funcs[] = { }; + +/** + * kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space + * emulation or in-kernel emulation + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the CSR data + * + * Returns > 0 upon failure and 0 upon success + */ +int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + ulong insn; + + if (vcpu->arch.csr_decode.return_handled) + return 0; + vcpu->arch.csr_decode.return_handled = 1; + + /* Update destination register for CSR reads */ + insn = vcpu->arch.csr_decode.insn; + if ((insn >> SH_RD) & MASK_RX) + SET_RD(insn, &vcpu->arch.guest_context, + run->riscv_csr.ret_value); + + /* Move to next instruction */ + vcpu->arch.guest_context.sepc += INSN_LEN(insn); + + return 0; +} + +static int csr_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn) +{ + int i, rc = KVM_INSN_ILLEGAL_TRAP; + unsigned int csr_num = insn >> SH_RS2; + unsigned int rs1_num = (insn >> SH_RS1) & MASK_RX; + ulong rs1_val = GET_RS1(insn, &vcpu->arch.guest_context); + const struct csr_func *tcfn, *cfn = NULL; + ulong val = 0, wr_mask = 0, new_val = 0; + + /* Decode the CSR instruction */ + switch (GET_FUNCT3(insn)) { + case GET_FUNCT3(INSN_MATCH_CSRRW): + wr_mask = -1UL; + new_val = rs1_val; + break; + case GET_FUNCT3(INSN_MATCH_CSRRS): + wr_mask = rs1_val; + new_val = -1UL; + break; + case GET_FUNCT3(INSN_MATCH_CSRRC): + wr_mask = rs1_val; + new_val = 0; + break; + case GET_FUNCT3(INSN_MATCH_CSRRWI): + wr_mask = -1UL; + new_val = rs1_num; + break; + case GET_FUNCT3(INSN_MATCH_CSRRSI): + wr_mask = rs1_num; + new_val = -1UL; + break; + case GET_FUNCT3(INSN_MATCH_CSRRCI): + wr_mask = rs1_num; + new_val = 0; + break; + default: + return rc; + } + + /* Save instruction decode info */ + vcpu->arch.csr_decode.insn = insn; + vcpu->arch.csr_decode.return_handled = 0; + + /* Update CSR details in kvm_run struct */ + run->riscv_csr.csr_num = csr_num; + run->riscv_csr.new_value = new_val;
+ run->riscv_csr.write_mask = wr_mask; + run->riscv_csr.ret_value = 0; + + /* Find in-kernel CSR function */ + for (i = 0; i < ARRAY_SIZE(csr_funcs); i++) { + tcfn = &csr_funcs[i]; + if ((tcfn->base <= csr_num) && + (csr_num < (tcfn->base + tcfn->count))) { + cfn = tcfn; + break; + } + } + + /* First try in-kernel CSR emulation */ + if (cfn && cfn->func) { + rc = cfn->func(vcpu, csr_num, &val, new_val, wr_mask); + if (rc > KVM_INSN_EXIT_TO_USER_SPACE) { + if (rc == KVM_INSN_CONTINUE_NEXT_SEPC) { + run->riscv_csr.ret_value = val; + vcpu->stat.csr_exit_kernel++; + kvm_riscv_vcpu_csr_return(vcpu, run); + rc = KVM_INSN_CONTINUE_SAME_SEPC; + } + return rc; + } + } + + /* Exit to user-space for CSR emulation */ + if (rc <= KVM_INSN_EXIT_TO_USER_SPACE) { + vcpu->stat.csr_exit_user++; + run->exit_reason = KVM_EXIT_RISCV_CSR; + } + + return rc; +} + static const struct insn_func system_opcode_funcs[] = { + { + .mask = INSN_MASK_CSRRW, + .match = INSN_MATCH_CSRRW, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRS, + .match = INSN_MATCH_CSRRS, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRC, + .match = INSN_MATCH_CSRRC, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRWI, + .match = INSN_MATCH_CSRRWI, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRSI, + .match = INSN_MATCH_CSRRSI, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRCI, + .match = INSN_MATCH_CSRRCI, + .func = csr_insn, + }, { .mask = INSN_MASK_WFI, .match = INSN_MATCH_WFI, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 860f867c50c0..0c1f42a40fd3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -270,6 +270,7 @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_RISCV_CSR 36 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -496,6 +497,13 @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_RISCV_CSR */ + struct { + unsigned long csr_num; + unsigned long new_value; + unsigned long write_mask; + unsigned long ret_value; + } riscv_csr; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3 From 68f2736a858324c3ec852f6c2cddd9d1c777357d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Jun 2022 15:38:48 -0400 Subject: mm: Convert all PageMovable users to movable_operations These drivers are rather uncomfortably hammered into the address_space_operations hole. They aren't filesystems and don't behave like filesystems. They just need their own movable_operations structure, which we can point to directly from page->mapping. 
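The driver-facing shape of the new interface is roughly as follows; this is a hypothetical skeleton driver, with the method signatures taken from the struct introduced by this series:

#include <linux/migrate.h>

/* Hypothetical skeleton: a driver defines its movable_operations once
 * and tags each page it can relocate with __SetPageMovable().
 */
static bool ex_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* take the page off the driver's internal lists */
	return true;
}

static int ex_migrate_page(struct page *dst, struct page *src,
			   enum migrate_mode mode)
{
	/* copy contents and bookkeeping from src to dst */
	return MIGRATEPAGE_SUCCESS;
}

static void ex_putback_page(struct page *page)
{
	/* migration failed: reinsert the page into the driver's lists */
}

static const struct movable_operations ex_mops = {
	.isolate_page	= ex_isolate_page,
	.migrate_page	= ex_migrate_page,
	.putback_page	= ex_putback_page,
};

/* when a page becomes eligible: __SetPageMovable(page, &ex_mops); */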
Signed-off-by: Matthew Wilcox (Oracle) --- Documentation/filesystems/locking.rst | 4 -- Documentation/filesystems/vfs.rst | 12 ---- Documentation/vm/page_migration.rst | 113 +++------------------------------- arch/powerpc/platforms/pseries/cmm.c | 60 +----------------- drivers/misc/vmw_balloon.c | 61 +----------------- drivers/virtio/virtio_balloon.c | 47 +------------- include/linux/balloon_compaction.h | 6 +- include/linux/fs.h | 2 - include/linux/migrate.h | 56 +++++++++++++++-- include/linux/page-flags.h | 2 +- include/uapi/linux/magic.h | 4 -- mm/balloon_compaction.c | 10 ++- mm/compaction.c | 29 ++++----- mm/migrate.c | 24 ++++---- mm/util.c | 4 +- mm/z3fold.c | 84 +++---------------------- mm/zsmalloc.c | 102 +++++++----------------------- 17 files changed, 134 insertions(+), 486 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index c0fe711f14d3..9963d9600b71 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -252,9 +252,7 @@ prototypes:: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); - bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); - void (*putback_page) (struct page *); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); @@ -280,9 +278,7 @@ invalidate_folio: yes exclusive release_folio: yes free_folio: yes direct_IO: -isolate_page: yes migratepage: yes (both) -putback_page: yes launder_folio: yes is_partially_uptodate: yes error_remove_page: yes diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index a08c652467d7..b51665cdabc4 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -737,12 +737,8 @@ cache in your filesystem. The following members are defined: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* isolate a page for migration */ - bool (*isolate_page) (struct page *, isolate_mode_t); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); - /* put migration-failed page back to right list */ - void (*putback_page) (struct page *); int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, @@ -930,11 +926,6 @@ cache in your filesystem. The following members are defined: data directly between the storage and the application's address space. -``isolate_page`` - Called by the VM when isolating a movable non-lru page. If page - is successfully isolated, VM marks the page as PG_isolated via - __SetPageIsolated. - ``migrate_page`` This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is @@ -942,9 +933,6 @@ cache in your filesystem. The following members are defined: page to this function. migrate_page should transfer any private data across and update any references that it has to the page. -``putback_page`` - Called by the VM when isolated page's migration fails. - ``launder_folio`` Called before freeing a folio - it writes back the dirty folio. 
To prevent redirtying the folio, it is kept locked during the diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 8c5cb8147e55..11493bad7112 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -152,110 +152,15 @@ Steps: Non-LRU page migration ====================== -Although migration originally aimed for reducing the latency of memory accesses -for NUMA, compaction also uses migration to create high-order pages. +Although migration originally aimed for reducing the latency of memory +accesses for NUMA, compaction also uses migration to create high-order +pages. For compaction purposes, it is also useful to be able to move +non-LRU pages, such as zsmalloc and virtio-balloon pages. -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. However, there are potential non-LRU pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add their own specific hooks in the migration path. - -To overcome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -in the migration path. - -If a driver wants to make its pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects from isolate_page() function of driver is to return *true* - if driver isolates the page successfully. On returning true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in those fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage() of driver with the isolated page. - The function of migratepage() is to move the contents of the old page to the - new page - and set up fields of struct page newpage. Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returned - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporary migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without - retrying. - - Driver shouldn't touch the page.lru field while in the migratepage() function. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on the isolated page, VM should return the isolated page - to the driver so VM calls the driver's putback_page() with the isolated page. - In this function, the driver should put the isolated page back into its own data - structure. - -Non-LRU movable page flags - - There are two page flags for supporting non-LRU movable page. 
- - * PG_movable - - Driver should use the function below to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather, VM - reuses the page->mapping's lower bits to represent it:: - - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping() which masks off the low two bits of page->mapping under - page lock so it can get the right struct address_space. - - For testing of non-LRU movable pages, VM supports __PageMovable() function. - However, it doesn't guarantee to identify non-LRU movable pages because - the page->mapping field is unified with other variables in struct page. - If the driver releases the page after isolation by VM, page->mapping - doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set - (look at __ClearPageMovable). But __PageMovable() is cheap to call whether - page is LRU or non-LRU movable once the page has been isolated because LRU - pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also - good for just peeking to test non-LRU movable pages before more expensive - checking with lock_page() in pfn scanning to select a victim. - - For guaranteeing non-LRU movable page, VM provides PageMovable() function. - Unlike __PageMovable(), PageMovable() validates page->mapping and - mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents - sudden destroying of page->mapping. - - Drivers using __SetPageMovable() should clear the flag via - __ClearMovablePage() under page_lock() before the releasing the page. - - * PG_isolated - - To prevent concurrent isolation among several CPUs, VM marks isolated page - as PG_isolated under lock_page(). So if a CPU encounters PG_isolated - non-LRU movable page, it can skip it. Driver doesn't need to manipulate the - flag because VM will set/clear it automatically. Keep in mind that if the - driver sees a PG_isolated page, it means the page has been isolated by the - VM so it shouldn't touch the page.lru field. - The PG_isolated flag is aliased with the PG_reclaim flag so drivers - shouldn't use PG_isolated for its own purposes. +If a driver wants to make its pages movable, it should define a struct +movable_operations. It then needs to call __SetPageMovable() on each +page that it may be able to move. This uses the ``page->mapping`` field, +so this field is not available for the driver to use for other purposes. Monitoring Migration ===================== @@ -286,3 +191,5 @@ THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. Christoph Lameter, May 8, 2006. Minchan Kim, Mar 28, 2016. + +.. kernel-doc:: include/linux/migrate.h diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 15ed8206c463..5f4037c1d7fe 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,9 +19,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -500,19 +497,6 @@ static struct notifier_block cmm_mem_nb = { }; #ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; - -static int cmm_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, PPC_CMM_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "ppc-cmm", - .init_fs_context = cmm_init_fs_context, - .kill_sb = kill_anon_super, -}; - static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -564,47 +548,13 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return MIGRATEPAGE_SUCCESS; } -static int cmm_balloon_compaction_init(void) +static void cmm_balloon_compaction_init(void) { - int rc; - balloon_devinfo_init(&b_dev_info); b_dev_info.migratepage = cmm_migratepage; - - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - rc = PTR_ERR(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(b_dev_info.inode)) { - rc = PTR_ERR(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; -} -static void cmm_balloon_compaction_deinit(void) -{ - if (b_dev_info.inode) - iput(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; } #else /* CONFIG_BALLOON_COMPACTION */ -static int cmm_balloon_compaction_init(void) -{ - return 0; -} - -static void cmm_balloon_compaction_deinit(void) +static void cmm_balloon_compaction_init(void) { } #endif /* CONFIG_BALLOON_COMPACTION */ @@ -622,9 +572,7 @@ static int cmm_init(void) if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate) return -EOPNOTSUPP; - rc = cmm_balloon_compaction_init(); - if (rc) - return rc; + cmm_balloon_compaction_init(); rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) @@ -658,7 +606,6 @@ out_reboot_notifier: out_oom_notifier: unregister_oom_notifier(&cmm_oom_nb); out_balloon_compaction: - cmm_balloon_compaction_deinit(); return rc; } @@ -677,7 +624,6 @@ static void cmm_exit(void) unregister_memory_notifier(&cmm_mem_nb); cmm_free_pages(atomic_long_read(&loaned_pages)); cmm_unregister_sysfs(&cmm_dev); - cmm_balloon_compaction_deinit(); } /** diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 086ce77d9074..85dd6aa33df6 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include @@ -1730,20 +1728,6 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b) #ifdef CONFIG_BALLOON_COMPACTION - -static int vmballoon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type vmballoon_fs = { - .name = "balloon-vmware", - .init_fs_context = vmballoon_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *vmballoon_mnt; - /** * vmballoon_migratepage() - migrates a balloon page. * @b_dev_info: balloon device information descriptor. @@ -1862,21 +1846,6 @@ out_unlock: return ret; } -/** - * vmballoon_compaction_deinit() - removes compaction related data. - * - * @b: pointer to the balloon. - */ -static void vmballoon_compaction_deinit(struct vmballoon *b) -{ - if (!IS_ERR(b->b_dev_info.inode)) - iput(b->b_dev_info.inode); - - b->b_dev_info.inode = NULL; - kern_unmount(vmballoon_mnt); - vmballoon_mnt = NULL; -} - /** * vmballoon_compaction_init() - initialized compaction for the balloon. * @@ -1888,33 +1857,15 @@ static void vmballoon_compaction_deinit(struct vmballoon *b) * * Return: zero on success or error code on failure. 
*/ -static __init int vmballoon_compaction_init(struct vmballoon *b) +static __init void vmballoon_compaction_init(struct vmballoon *b) { - vmballoon_mnt = kern_mount(&vmballoon_fs); - if (IS_ERR(vmballoon_mnt)) - return PTR_ERR(vmballoon_mnt); - b->b_dev_info.migratepage = vmballoon_migratepage; - b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb); - - if (IS_ERR(b->b_dev_info.inode)) - return PTR_ERR(b->b_dev_info.inode); - - b->b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; } #else /* CONFIG_BALLOON_COMPACTION */ - -static void vmballoon_compaction_deinit(struct vmballoon *b) -{ -} - -static int vmballoon_compaction_init(struct vmballoon *b) +static inline void vmballoon_compaction_init(struct vmballoon *b) { - return 0; } - #endif /* CONFIG_BALLOON_COMPACTION */ static int __init vmballoon_init(void) @@ -1939,9 +1890,7 @@ static int __init vmballoon_init(void) * balloon_devinfo_init() . */ balloon_devinfo_init(&balloon.b_dev_info); - error = vmballoon_compaction_init(&balloon); - if (error) - goto fail; + vmballoon_compaction_init(&balloon); INIT_LIST_HEAD(&balloon.huge_pages); spin_lock_init(&balloon.comm_lock); @@ -1958,7 +1907,6 @@ static int __init vmballoon_init(void) return 0; fail: vmballoon_unregister_shrinker(&balloon); - vmballoon_compaction_deinit(&balloon); return error; } @@ -1985,8 +1933,5 @@ static void __exit vmballoon_exit(void) */ vmballoon_send_start(&balloon, 0); vmballoon_pop(&balloon); - - /* Only once we popped the balloon, compaction can be deinit */ - vmballoon_compaction_deinit(&balloon); } module_exit(vmballoon_exit); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b9737da6c4dd..bd360b91e9d3 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -17,9 +17,6 @@ #include #include #include -#include -#include -#include #include /* @@ -42,10 +39,6 @@ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) -#ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; -#endif - enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -805,18 +798,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, return MIGRATEPAGE_SUCCESS; } - -static int balloon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, BALLOON_KVM_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "balloon-kvm", - .init_fs_context = balloon_init_fs_context, - .kill_sb = kill_anon_super, -}; - #endif /* CONFIG_BALLOON_COMPACTION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, @@ -909,19 +890,7 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_free_vb; #ifdef CONFIG_BALLOON_COMPACTION - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - err = PTR_ERR(balloon_mnt); - goto out_del_vqs; - } - vb->vb_dev_info.migratepage = virtballoon_migratepage; - vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(vb->vb_dev_info.inode)) { - err = PTR_ERR(vb->vb_dev_info.inode); - goto out_kern_unmount; - } - vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { /* @@ -930,13 +899,13 @@ static int virtballoon_probe(struct virtio_device *vdev) */ if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { err = -ENOSPC; - goto out_iput; + goto out_del_vqs; } vb->balloon_wq = alloc_workqueue("balloon-wq", WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); if (!vb->balloon_wq) { err = -ENOMEM; - goto out_iput; + goto out_del_vqs; } INIT_WORK(&vb->report_free_page_work, report_free_page_func); vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP; @@ -1030,13 +999,7 @@ out_unregister_shrinker: out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); -out_iput: -#ifdef CONFIG_BALLOON_COMPACTION - iput(vb->vb_dev_info.inode); -out_kern_unmount: - kern_unmount(balloon_mnt); out_del_vqs: -#endif vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); @@ -1083,12 +1046,6 @@ static void virtballoon_remove(struct virtio_device *vdev) } remove_common(vb); -#ifdef CONFIG_BALLOON_COMPACTION - if (vb->vb_dev_info.inode) - iput(vb->vb_dev_info.inode); - - kern_unmount(balloon_mnt); -#endif kfree(vb); } diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index edb7f6d41faa..5ca2d5699620 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -57,7 +57,6 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); - struct inode *inode; }; extern struct page *balloon_page_alloc(void); @@ -75,11 +74,10 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; - balloon->inode = NULL; } #ifdef CONFIG_BALLOON_COMPACTION -extern const struct address_space_operations balloon_aops; +extern const struct movable_operations balloon_mops; /* * balloon_page_insert - insert a page into the balloon's page list and make @@ -94,7 +92,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { __SetPageOffline(page); - __SetPageMovable(page, balloon->inode->i_mapping); + __SetPageMovable(page, &balloon_mops); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..5d8ee3155ca2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,6 @@ struct address_space_operations { */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); - bool 
(*isolate_page)(struct page *, isolate_mode_t); - void (*putback_page)(struct page *); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 069a89e847f3..82c735ba6109 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,43 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 +/** + * struct movable_operations - Driver page migration + * @isolate_page: + * The VM calls this function to prepare the page to be moved. The page + * is locked and the driver should not unlock it. The driver should + * return ``true`` if the page is movable and ``false`` if it is not + * currently movable. After this function returns, the VM uses the + * page->lru field, so the driver must preserve any information which + * is usually stored here. + * + * @migrate_page: + * After isolation, the VM calls this function with the isolated + * @src page. The driver should copy the contents of the + * @src page to the @dst page and set up the fields of @dst page. + * Both pages are locked. + * If page migration is successful, the driver should call + * __ClearPageMovable(@src) and return MIGRATEPAGE_SUCCESS. + * If the driver cannot migrate the page at the moment, it can return + * -EAGAIN. The VM interprets this as a temporary migration failure and + * will retry it later. Any other error value is a permanent migration + * failure and migration will not be retried. + * The driver shouldn't touch the @src->lru field while in the + * migrate_page() function. It may write to @dst->lru. + * + * @putback_page: + * If migration fails on the isolated page, the VM informs the driver + * that the page is no longer a candidate for migration by calling + * this function. The driver should put the isolated page back into + * its own data structure. 
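/*
 * Editorial sketch, not part of this patch: a minimal driver-side wiring
 * of these operations. The mydrv_* names and the driver-private page
 * list are hypothetical; see the balloon, z3fold and zsmalloc
 * conversions later in this patch for real users.
 */
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/migrate.h>
#include <linux/spinlock.h>

static const struct movable_operations mydrv_mops; /* forward declaration, as z3fold does below */

static LIST_HEAD(mydrv_pages);
static DEFINE_SPINLOCK(mydrv_lock);

static bool mydrv_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* The page is locked; drop it from the driver's list so the VM
	 * may reuse page->lru. Return false if it cannot be isolated.
	 */
	spin_lock(&mydrv_lock);
	list_del_init(&page->lru);
	spin_unlock(&mydrv_lock);
	return true;
}

static int mydrv_migrate_page(struct page *dst, struct page *src,
			      enum migrate_mode mode)
{
	copy_highpage(dst, src);		/* move the contents across */
	__SetPageMovable(dst, &mydrv_mops);	/* the new page is movable */
	__ClearPageMovable(src);		/* the old one no longer is */
	return MIGRATEPAGE_SUCCESS;
}

static void mydrv_putback_page(struct page *page)
{
	/* Migration failed: return the page to the driver's list. */
	spin_lock(&mydrv_lock);
	list_add(&page->lru, &mydrv_pages);
	spin_unlock(&mydrv_lock);
}

static const struct movable_operations mydrv_mops = {
	.isolate_page	= mydrv_isolate_page,
	.migrate_page	= mydrv_migrate_page,
	.putback_page	= mydrv_putback_page,
};
/*
 * Registration follows the pattern used by the conversions in this
 * patch: lock the page, call __SetPageMovable(page, &mydrv_mops),
 * unlock. The VM sets and clears PG_isolated by itself.
 */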
+ */ +struct movable_operations { + bool (*isolate_page)(struct page *, isolate_mode_t); + int (*migrate_page)(struct page *dst, struct page *src, + enum migrate_mode); + void (*putback_page)(struct page *); +}; + /* Defined in mm/debug.c: */ extern const char *migrate_reason_names[MR_TYPES]; @@ -91,13 +128,13 @@ static inline int next_demotion_node(int node) #endif #ifdef CONFIG_COMPACTION -extern int PageMovable(struct page *page); -extern void __SetPageMovable(struct page *page, struct address_space *mapping); -extern void __ClearPageMovable(struct page *page); +bool PageMovable(struct page *page); +void __SetPageMovable(struct page *page, const struct movable_operations *ops); +void __ClearPageMovable(struct page *page); #else -static inline int PageMovable(struct page *page) { return 0; } +static inline bool PageMovable(struct page *page) { return false; } static inline void __SetPageMovable(struct page *page, - struct address_space *mapping) + const struct movable_operations *ops) { } static inline void __ClearPageMovable(struct page *page) @@ -110,6 +147,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *page_movable_ops(struct page *page) +{ + VM_BUG_ON(!__PageMovable(page)); + + return (const struct movable_operations *) + ((unsigned long)page->mapping - PAGE_MAPPING_MOVABLE); +} + #ifdef CONFIG_NUMA_BALANCING extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191d..3f5490f6f038 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -639,7 +639,7 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points a struct address_space. + * page and then page->mapping points to a struct movable_operations. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f724129c0425..6325d1d0e90f 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -98,12 +98,8 @@ /* Since UDF 2.01 is ISO 13346 based... 
*/ #define UDF_SUPER_MAGIC 0x15013346 -#define BALLOON_KVM_MAGIC 0x13661366 -#define ZSMALLOC_MAGIC 0x58295829 #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ -#define Z3FOLD_MAGIC 0x33 -#define PPC_CMM_MAGIC 0xc7571590 #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 4b8eab4b3f45..22c96fed70b5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -228,10 +228,8 @@ static void balloon_page_putback(struct page *page) spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } - /* move_to_new_page() counterpart for a ballooned page */ -static int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, struct page *page, +static int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); @@ -250,11 +248,11 @@ static int balloon_page_migrate(struct address_space *mapping, return balloon->migratepage(balloon, newpage, page, mode); } -const struct address_space_operations balloon_aops = { - .migratepage = balloon_page_migrate, +const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, }; -EXPORT_SYMBOL_GPL(balloon_aops); +EXPORT_SYMBOL_GPL(balloon_mops); #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/compaction.c b/mm/compaction.c index 1f89b969c12b..f23efba1d118 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -110,28 +110,27 @@ static void split_map_pages(struct list_head *list) } #ifdef CONFIG_COMPACTION - -int PageMovable(struct page *page) +bool PageMovable(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops; VM_BUG_ON_PAGE(!PageLocked(page), page); if (!__PageMovable(page)) - return 0; + return false; - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) - return 1; + mops = page_movable_ops(page); + if (mops) + return true; - return 0; + return false; } EXPORT_SYMBOL(PageMovable); -void __SetPageMovable(struct page *page, struct address_space *mapping) +void __SetPageMovable(struct page *page, const struct movable_operations *mops) { VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); - page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); + VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); } EXPORT_SYMBOL(__SetPageMovable); @@ -139,12 +138,10 @@ void __ClearPageMovable(struct page *page) { VM_BUG_ON_PAGE(!PageMovable(page), page); /* - * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE - * flag so that VM can catch up released page by driver after isolation. - * With it, VM migration doesn't try to put it back. + * This page still has the type of a movable page, but it's + * actually not movable any more. 
*/ - page->mapping = (void *)((unsigned long)page->mapping & - PAGE_MAPPING_MOVABLE); + page->mapping = (void *)PAGE_MAPPING_MOVABLE; } EXPORT_SYMBOL(__ClearPageMovable); diff --git a/mm/migrate.c b/mm/migrate.c index 6c1ea61f39d8..491f03747832 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -59,7 +59,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { - struct address_space *mapping; + const struct movable_operations *mops; /* * Avoid burning cycles with pages that are yet under __free_pages(), @@ -97,10 +97,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (!PageMovable(page) || PageIsolated(page)) goto out_no_isolated; - mapping = page_mapping(page); - VM_BUG_ON_PAGE(!mapping, page); + mops = page_movable_ops(page); + VM_BUG_ON_PAGE(!mops, page); - if (!mapping->a_ops->isolate_page(page, mode)) + if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ @@ -120,10 +120,9 @@ out: static void putback_movable_page(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops = page_movable_ops(page); - mapping = page_mapping(page); - mapping->a_ops->putback_page(page); + mops->putback_page(page); ClearPageIsolated(page); } @@ -846,16 +845,15 @@ static int fallback_migrate_page(struct address_space *mapping, static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { - struct address_space *mapping; int rc = -EAGAIN; bool is_lru = !__PageMovable(&src->page); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - mapping = folio_mapping(src); - if (likely(is_lru)) { + struct address_space *mapping = folio_mapping(src); + if (!mapping) rc = migrate_page(mapping, &dst->page, &src->page, mode); else if (mapping->a_ops->migratepage) @@ -872,6 +870,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, rc = fallback_migrate_page(mapping, &dst->page, &src->page, mode); } else { + const struct movable_operations *mops; + /* * In case of non-lru page, it could be released after * isolation step. In that case, we shouldn't try migration. 
@@ -883,8 +883,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); + mops = page_movable_ops(&src->page); + rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); } diff --git a/mm/util.c b/mm/util.c index 0837570c9225..53af0e79d3e4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -804,10 +804,10 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio_swap_entry(folio)); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); + return mapping; } EXPORT_SYMBOL(folio_mapping); diff --git a/mm/z3fold.c b/mm/z3fold.c index f41f8b0d9e9a..cf71da10d04e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,15 +34,11 @@ #include #include #include -#include -#include -#include #include #include #include #include #include -#include #include /* @@ -149,7 +145,6 @@ struct z3fold_header { * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release - * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -169,7 +164,6 @@ struct z3fold_pool { struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; - struct inode *inode; }; /* @@ -334,54 +328,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) } } -static int z3fold_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, Z3FOLD_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type z3fold_fs = { - .name = "z3fold", - .init_fs_context = z3fold_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *z3fold_mnt; -static int __init z3fold_mount(void) -{ - int ret = 0; - - z3fold_mnt = kern_mount(&z3fold_fs); - if (IS_ERR(z3fold_mnt)) - ret = PTR_ERR(z3fold_mnt); - - return ret; -} - -static void z3fold_unmount(void) -{ - kern_unmount(z3fold_mnt); -} - -static const struct address_space_operations z3fold_aops; -static int z3fold_register_migration(struct z3fold_pool *pool) -{ - pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &z3fold_aops; - return 0; -} - -static void z3fold_unregister_migration(struct z3fold_pool *pool) -{ - if (pool->inode) - iput(pool->inode); -} - /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) @@ -1002,14 +948,10 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; - if (z3fold_register_migration(pool)) - goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; -out_rwq: - destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: @@ -1043,11 +985,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) destroy_workqueue(pool->compact_wq); destroy_workqueue(pool->release_wq); - z3fold_unregister_migration(pool); free_percpu(pool->unbuddied); kfree(pool); } +static const struct movable_operations z3fold_mops; + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -1117,11 +1060,11 @@ retry: } if (can_sleep) { lock_page(page); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } else { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } z3fold_page_lock(zhdr); @@ -1554,12 +1497,11 @@ out: return false; } -static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int z3fold_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - struct address_space *new_mapping; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); @@ -1592,7 +1534,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa * so we only have to reinitialize it. 
*/ INIT_LIST_HEAD(&new_zhdr->buddy); - new_mapping = page_mapping(page); __ClearPageMovable(page); get_page(newpage); @@ -1608,7 +1549,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa spin_lock(&pool->lock); list_add(&newpage->lru, &pool->lru); spin_unlock(&pool->lock); - __SetPageMovable(newpage, new_mapping); + __SetPageMovable(newpage, &z3fold_mops); z3fold_page_unlock(new_zhdr); queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); @@ -1642,9 +1583,9 @@ static void z3fold_page_putback(struct page *page) z3fold_page_unlock(zhdr); } -static const struct address_space_operations z3fold_aops = { +static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, - .migratepage = z3fold_page_migrate, + .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, }; @@ -1746,17 +1687,11 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { - int ret; - /* * Make sure the z3fold header is not larger than the page size and * there has remaining spaces for its buddy. */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); - ret = z3fold_mount(); - if (ret) - return ret; - zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1764,7 +1699,6 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { - z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d5fc04385b8..71d6edcbea48 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -59,8 +58,6 @@ #include #include #include -#include -#include #include #include #include @@ -177,10 +174,6 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif -#ifdef CONFIG_COMPACTION -static struct vfsmount *zsmalloc_mnt; -#endif - /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where @@ -252,7 +245,6 @@ struct zs_pool { struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION - struct inode *inode; struct work_struct free_work; #endif /* protect page/zspage migration */ @@ -271,6 +263,7 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + struct zs_pool *pool; #ifdef CONFIG_COMPACTION rwlock_t lock; #endif @@ -295,8 +288,6 @@ static bool ZsHugePage(struct zspage *zspage) } #ifdef CONFIG_COMPACTION -static int zs_register_migration(struct zs_pool *pool); -static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); @@ -307,10 +298,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static int zsmalloc_mount(void) { return 0; } -static void zsmalloc_unmount(void) {} -static int zs_register_migration(struct zs_pool *pool) { return 0; } -static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} @@ -1083,6 +1070,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); + zspage->pool = pool; return zspage; } @@ -1754,33 +1742,6 @@ static void 
lock_zspage(struct zspage *zspage) migrate_read_unlock(zspage); } -static int zs_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type zsmalloc_fs = { - .name = "zsmalloc", - .init_fs_context = zs_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static int zsmalloc_mount(void) -{ - int ret = 0; - - zsmalloc_mnt = kern_mount(&zsmalloc_fs); - if (IS_ERR(zsmalloc_mnt)) - ret = PTR_ERR(zsmalloc_mnt); - - return ret; -} - -static void zsmalloc_unmount(void) -{ - kern_unmount(zsmalloc_mnt); -} - static void migrate_lock_init(struct zspage *zspage) { rwlock_init(&zspage->lock); @@ -1823,6 +1784,8 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static const struct movable_operations zsmalloc_mops; + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1843,7 +1806,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; - __SetPageMovable(newpage, page_mapping(oldpage)); + __SetPageMovable(newpage, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1865,8 +1828,8 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -static int zs_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int zs_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; @@ -1889,14 +1852,15 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - pool = mapping->private_data; + /* The page is locked, so this pointer must remain valid */ + zspage = get_zspage(page); + pool = zspage->pool; /* * The pool migrate_lock protects the race between zpage migration * and zs_free. */ write_lock(&pool->migrate_lock); - zspage = get_zspage(page); class = zspage_class(pool, zspage); /* @@ -1964,31 +1928,12 @@ static void zs_page_putback(struct page *page) migrate_write_unlock(zspage); } -static const struct address_space_operations zsmalloc_aops = { +static const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, - .migratepage = zs_page_migrate, + .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, }; -static int zs_register_migration(struct zs_pool *pool) -{ - pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &zsmalloc_aops; - return 0; -} - -static void zs_unregister_migration(struct zs_pool *pool) -{ - flush_work(&pool->free_work); - iput(pool->inode); -} - /* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data. 
@@ -2032,6 +1977,11 @@ static void kick_deferred_free(struct zs_pool *pool) schedule_work(&pool->free_work); } +static void zs_flush_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); +} + static void init_deferred_free(struct zs_pool *pool) { INIT_WORK(&pool->free_work, async_free_zspage); @@ -2043,10 +1993,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) do { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &zsmalloc_mops); unlock_page(page); } while ((page = get_next_page(page)) != NULL); } +#else +static inline void zs_flush_migration(struct zs_pool *pool) { } #endif /* @@ -2324,9 +2276,6 @@ struct zs_pool *zs_create_pool(const char *name) /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); - if (zs_register_migration(pool)) - goto err; - /* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. If @@ -2348,7 +2297,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); - zs_unregister_migration(pool); + zs_flush_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < ZS_SIZE_CLASSES; i++) { @@ -2380,14 +2329,10 @@ static int __init zs_init(void) { int ret; - ret = zsmalloc_mount(); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", zs_cpu_prepare, zs_cpu_dead); if (ret) - goto hp_setup_fail; + goto out; #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); @@ -2397,8 +2342,6 @@ static int __init zs_init(void) return 0; -hp_setup_fail: - zsmalloc_unmount(); out: return ret; } @@ -2408,7 +2351,6 @@ static void __exit zs_exit(void) { #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif - zsmalloc_unmount(); cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); -- cgit v1.2.3 From 0aa73170eba5eae638c1b96a05eba533f030b5cb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Jul 2022 17:27:49 +0800 Subject: ublk_drv: add SET_PARAMS/GET_PARAMS control command Add two commands to set/get parameters generically. One important goal of ublk is to provide a generic framework for building block devices from userspace flexibly. As with any generic block device, there are lots of block parameters, such as max_sectors, write_cache/fua, discard related limits, zoned parameters, and so on, so this patch starts to add a generic mechanism for setting and getting device parameters. Both generic block parameters (all kinds of queue settings) and ublk feature parameters can be covered this way, which makes future extension quite easy. Two parameter types are used so far: basic (covering basic queue settings and misc settings which can't be grouped easily) and discard. The basic type must be set, while the discard type is optional for now. This provides a mechanism to simulate any kind of generic block device from userspace easily, from both the block queue setting viewpoint and the ublk feature viewpoint. The style of putting all parameters together is suggested by Christoph.
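As an illustration only (not part of this patch), userspace might fill the new structure like this before issuing UBLK_CMD_SET_PARAMS, using the field and flag names added to include/uapi/linux/ublk_cmd.h below; the concrete values are made up for a 1 GiB device, and the io_uring control-command plumbing that actually sends the command is elided:

#include <linux/ublk_cmd.h>	/* struct ublk_params, UBLK_PARAM_TYPE_* */
#include <limits.h>

static const struct ublk_params params = {
	.len   = sizeof(struct ublk_params),
	.types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD,
	.basic = {
		.logical_bs_shift  = 9,		/* must not exceed PAGE_SHIFT */
		.physical_bs_shift = 12,	/* must be >= logical_bs_shift */
		.io_min_shift      = 9,
		.io_opt_shift      = 12,
		.max_sectors       = 512,	/* validated against the max io buffer size */
		.dev_sectors       = 1ULL << 21, /* 1 GiB in 512-byte sectors */
	},
	.discard = {
		.discard_granularity  = 4096,	/* must be non-zero */
		.max_discard_sectors  = UINT_MAX >> 9,
		.max_discard_segments = 1,	/* only single-segment discard so far */
	},
};

Note that the driver copies the len/types header first and truncates len to the size it knows, so the two sides can use different versions of the structure, and SET_PARAMS fails with -EACCES once the device is live.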
Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220730092750.1118167-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 205 ++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/ublk_cmd.h | 47 ++++++++++ 2 files changed, 234 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ae98e81b21ce..20ad83b25318 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -49,6 +49,9 @@ /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK) +/* All UBLK_PARAM_TYPE_* should be included here */ +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) + struct ublk_rq_data { struct callback_head work; }; @@ -137,6 +140,8 @@ struct ublk_device { spinlock_t mm_lock; struct mm_struct *mm; + struct ublk_params params; + struct completion completion; unsigned int nr_queues_ready; atomic_t nr_aborted_queues; @@ -149,6 +154,12 @@ struct ublk_device { struct work_struct stop_work; }; +/* header of ublk_params */ +struct ublk_params_header { + __u32 len; + __u32 types; +}; + static dev_t ublk_chr_devt; static struct class *ublk_chr_class; @@ -160,6 +171,91 @@ static DEFINE_MUTEX(ublk_ctl_mutex); static struct miscdevice ublk_misc; +static void ublk_dev_param_basic_apply(struct ublk_device *ub) +{ + struct request_queue *q = ub->ub_disk->queue; + const struct ublk_param_basic *p = &ub->params.basic; + + blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); + blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); + blk_queue_io_min(q, 1 << p->io_min_shift); + blk_queue_io_opt(q, 1 << p->io_opt_shift); + + blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, + p->attrs & UBLK_ATTR_FUA); + if (p->attrs & UBLK_ATTR_ROTATIONAL) + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + + blk_queue_max_hw_sectors(q, p->max_sectors); + blk_queue_chunk_sectors(q, p->chunk_sectors); + blk_queue_virt_boundary(q, p->virt_boundary_mask); + + if (p->attrs & UBLK_ATTR_READ_ONLY) + set_disk_ro(ub->ub_disk, true); + + set_capacity(ub->ub_disk, p->dev_sectors); +} + +static void ublk_dev_param_discard_apply(struct ublk_device *ub) +{ + struct request_queue *q = ub->ub_disk->queue; + const struct ublk_param_discard *p = &ub->params.discard; + + q->limits.discard_alignment = p->discard_alignment; + q->limits.discard_granularity = p->discard_granularity; + blk_queue_max_discard_sectors(q, p->max_discard_sectors); + blk_queue_max_write_zeroes_sectors(q, + p->max_write_zeroes_sectors); + blk_queue_max_discard_segments(q, p->max_discard_segments); +} + +static int ublk_validate_params(const struct ublk_device *ub) +{ + /* basic param is the only one which must be set */ + if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { + const struct ublk_param_basic *p = &ub->params.basic; + + if (p->logical_bs_shift > PAGE_SHIFT) + return -EINVAL; + + if (p->logical_bs_shift > p->physical_bs_shift) + return -EINVAL; + + if (p->max_sectors > (ub->dev_info.rq_max_blocks << + (ub->bs_shift - 9))) + return -EINVAL; + } else + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { + const struct ublk_param_discard *p = &ub->params.discard; + + /* So far, only support single segment discard */ + if (p->max_discard_sectors && p->max_discard_segments != 1) + return -EINVAL; + + if (!p->discard_granularity) + return 
-EINVAL; + } + + return 0; +} + +static int ublk_apply_params(struct ublk_device *ub) +{ + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) + return -EINVAL; + + ublk_dev_param_basic_apply(ub); + + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) + ublk_dev_param_discard_apply(ub); + + return 0; +} + static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) { if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && @@ -1138,7 +1234,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - unsigned long dev_blocks = header->data[1]; struct ublk_device *ub; struct gendisk *disk; int ret = -EINVAL; @@ -1161,10 +1256,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) goto out_unlock; } - /* We may get disk size updated */ - if (dev_blocks) - ub->dev_info.dev_blocks = dev_blocks; - disk = blk_mq_alloc_disk(&ub->tag_set, ub); if (IS_ERR(disk)) { ret = PTR_ERR(disk); @@ -1174,19 +1265,13 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) disk->fops = &ub_fops; disk->private_data = ub; - blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size); - blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size); - blk_queue_io_min(disk->queue, ub->dev_info.block_size); - blk_queue_max_hw_sectors(disk->queue, - ub->dev_info.rq_max_blocks << (ub->bs_shift - 9)); - disk->queue->limits.discard_granularity = PAGE_SIZE; - blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9); - - set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9)); - ub->dev_info.ublksrv_pid = ublksrv_pid; ub->ub_disk = disk; + + ret = ublk_apply_params(ub); + if (ret) + goto out_put_disk; + get_device(&ub->cdev_dev); ret = add_disk(disk); if (ret) { @@ -1195,11 +1280,13 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) * called in case of add_disk failure. 
*/ ublk_put_device(ub); - put_disk(disk); - goto out_unlock; + goto out_put_disk; } set_bit(UB_STATE_USED, &ub->state); ub->dev_info.state = UBLK_S_DEV_LIVE; +out_put_disk: + if (ret) + put_disk(disk); out_unlock: mutex_unlock(&ub->mutex); ublk_put_device(ub); @@ -1447,6 +1534,82 @@ static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) return ret; } +static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_params_header ph; + struct ublk_device *ub; + int ret; + + if (header->len <= sizeof(ph) || !header->addr) + return -EINVAL; + + if (copy_from_user(&ph, argp, sizeof(ph))) + return -EFAULT; + + if (ph.len > header->len || !ph.len) + return -EINVAL; + + if (ph.len > sizeof(struct ublk_params)) + ph.len = sizeof(struct ublk_params); + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + mutex_lock(&ub->mutex); + if (copy_to_user(argp, &ub->params, ph.len)) + ret = -EFAULT; + else + ret = 0; + mutex_unlock(&ub->mutex); + + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_params_header ph; + struct ublk_device *ub; + int ret = -EFAULT; + + if (header->len <= sizeof(ph) || !header->addr) + return -EINVAL; + + if (copy_from_user(&ph, argp, sizeof(ph))) + return -EFAULT; + + if (ph.len > header->len || !ph.len || !ph.types) + return -EINVAL; + + if (ph.len > sizeof(struct ublk_params)) + ph.len = sizeof(struct ublk_params); + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + /* parameters can only be changed when device isn't live */ + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + ret = -EACCES; + } else if (copy_from_user(&ub->params, argp, ph.len)) { + ret = -EFAULT; + } else { + /* clear all we don't support yet */ + ub->params.types &= UBLK_PARAM_TYPE_ALL; + ret = ublk_validate_params(ub); + } + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + + return ret; +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1482,6 +1645,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_GET_QUEUE_AFFINITY: ret = ublk_ctrl_get_queue_affinity(cmd); break; + case UBLK_CMD_GET_PARAMS: + ret = ublk_ctrl_get_params(cmd); + break; + case UBLK_CMD_SET_PARAMS: + ret = ublk_ctrl_set_params(cmd); + break; default: break; } diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index ca33092354ab..54d065426f06 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -15,6 +15,8 @@ #define UBLK_CMD_DEL_DEV 0x05 #define UBLK_CMD_START_DEV 0x06 #define UBLK_CMD_STOP_DEV 0x07 +#define UBLK_CMD_SET_PARAMS 0x08 +#define UBLK_CMD_GET_PARAMS 0x09 /* * IO commands, issued by ublk server, and handled by ublk driver. 
@@ -158,4 +160,49 @@ struct ublksrv_io_cmd { __u64 addr; }; +struct ublk_param_basic { +#define UBLK_ATTR_READ_ONLY (1 << 0) +#define UBLK_ATTR_ROTATIONAL (1 << 1) +#define UBLK_ATTR_VOLATILE_CACHE (1 << 2) +#define UBLK_ATTR_FUA (1 << 3) + __u32 attrs; + __u8 logical_bs_shift; + __u8 physical_bs_shift; + __u8 io_opt_shift; + __u8 io_min_shift; + + __u32 max_sectors; + __u32 chunk_sectors; + + __u64 dev_sectors; + __u64 virt_boundary_mask; +}; + +struct ublk_param_discard { + __u32 discard_alignment; + + __u32 discard_granularity; + __u32 max_discard_sectors; + + __u32 max_write_zeroes_sectors; + __u16 max_discard_segments; + __u16 reserved0; +}; + +struct ublk_params { + /* + * Total length of parameters, userspace has to set 'len' for both + * SET_PARAMS and GET_PARAMS command, and driver may update len + * if two sides use different version of 'ublk_params', same with + * 'types' fields. + */ + __u32 len; +#define UBLK_PARAM_TYPE_BASIC (1 << 0) +#define UBLK_PARAM_TYPE_DISCARD (1 << 1) + __u32 types; /* types of parameter included */ + + struct ublk_param_basic basic; + struct ublk_param_discard discard; +}; + #endif -- cgit v1.2.3 From 4bf9cbf3e93426e9ebe136dabd6ca392ca92cfcb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Jul 2022 17:27:50 +0800 Subject: ublk_drv: cleanup ublksrv_ctrl_dev_info Remove all block device related info from ublksrv_ctrl_dev_info, meantime reduce its size into 64 bytes because: 1) ublksrv_ctrl_dev_info becomes cleaner without including any block related info 2) generic set/get parameter command can be used to set block related setting easily and cleanly 3) generic set/get parameter command can be used for extending ublk without needing more info in ublksrv_ctrl_dev_info Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220730092750.1118167-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 18 +++++++----------- include/uapi/linux/ublk_cmd.h | 15 ++++++++------- 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 20ad83b25318..2b3cd671a653 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -122,7 +122,6 @@ struct ublk_device { char *__queues; unsigned short queue_size; - unsigned short bs_shift; struct ublksrv_ctrl_dev_info dev_info; struct blk_mq_tag_set tag_set; @@ -223,8 +222,7 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->logical_bs_shift > p->physical_bs_shift) return -EINVAL; - if (p->max_sectors > (ub->dev_info.rq_max_blocks << - (ub->bs_shift - 9))) + if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; } else return -EINVAL; @@ -1185,13 +1183,13 @@ static void ublk_stop_work_fn(struct work_struct *work) ublk_stop_dev(ub); } -/* align maximum I/O size to PAGE_SIZE */ +/* align max io buffer size with PAGE_SIZE */ static void ublk_align_max_io_size(struct ublk_device *ub) { - unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift; + unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; - ub->dev_info.rq_max_blocks = - round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift; + ub->dev_info.max_io_buf_bytes = + round_down(max_io_bytes, PAGE_SIZE); } static int ublk_add_tag_set(struct ublk_device *ub) @@ -1348,9 +1346,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, info->dev_id, info->flags); - pr_devel("\t nr_hw_queues %d queue_depth %d block 
size %d dev_capacity %lld\n", - info->nr_hw_queues, info->queue_depth, - info->block_size, info->dev_blocks); + pr_devel("\t nr_hw_queues %d queue_depth %d\n", + info->nr_hw_queues, info->queue_depth); } static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) @@ -1410,7 +1407,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; - ub->bs_shift = ilog2(ub->dev_info.block_size); ub->dev_info.nr_hw_queues = min_t(unsigned int, ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 54d065426f06..57d86d0e8c5b 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -80,22 +80,23 @@ struct ublksrv_ctrl_cmd { struct ublksrv_ctrl_dev_info { __u16 nr_hw_queues; __u16 queue_depth; - __u16 block_size; __u16 state; + __u16 pad0; - __u32 rq_max_blocks; + __u32 max_io_buf_bytes; __u32 dev_id; - __u64 dev_blocks; - __s32 ublksrv_pid; - __s32 reserved0; + __u32 pad1; + __u64 flags; - __u64 flags_reserved; /* For ublksrv internal use, invisible to ublk driver */ __u64 ublksrv_flags; - __u64 reserved1[9]; + + __u64 reserved0; + __u64 reserved1; + __u64 reserved2; }; #define UBLK_IO_OP_READ 0 -- cgit v1.2.3 From 4e18403d9485a43e1b54397df258b8df7dac9a83 Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Thu, 28 Jul 2022 20:39:15 +0800 Subject: ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA Add one new ublk command: UBLK_IO_NEED_GET_DATA. It is prepared for a new feature designed for a user application who wants to allocate IO buffer and set IO buffer address only after it receives an IO request from ublksrv. Reviewed-by: Ming Lei Signed-off-by: ZiyangZhang Link: https://lore.kernel.org/r/c8a64b6b51c78340da7daa9e1054608695e79619.1659011443.git.ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 57d86d0e8c5b..677edaab2b66 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -30,12 +30,21 @@ * this IO request, request's handling result is committed to ublk * driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be * handled before completing io request. + * + * NEED_GET_DATA: only used for write requests to set io addr and copy data + * When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA + * command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA. + * + * It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag + * while starting a ublk device. */ #define UBLK_IO_FETCH_REQ 0x20 #define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21 +#define UBLK_IO_NEED_GET_DATA 0x22 /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 +#define UBLK_IO_RES_NEED_GET_DATA 1 #define UBLK_IO_RES_ABORT (-ENODEV) #define UBLKSRV_CMD_BUF_OFFSET 0 @@ -56,6 +65,15 @@ */ #define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1) +/* + * User should issue io cmd again for write requests to + * set io buffer address and copy data from bio vectors + * to the userspace io buffer. + * + * In this mode, task_work is not used. 
+ */ +#define UBLK_F_NEED_GET_DATA (1UL << 2) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From 36d763509be326bb383b1b1852a129ff58d74e3b Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Wed, 27 Jul 2022 17:40:53 +0200 Subject: xfrm: fix XFRMA_LASTUSED comment It is a __u64, internally time64_t. Fixes: bf825f81b454 ("xfrm: introduce basic mark infrastructure") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 65e13a099b1a..a9f5d884560a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -296,7 +296,7 @@ enum xfrm_attr_type_t { XFRMA_ETIMER_THRESH, XFRMA_SRCADDR, /* xfrm_address_t */ XFRMA_COADDR, /* xfrm_address_t */ - XFRMA_LASTUSED, /* unsigned long */ + XFRMA_LASTUSED, /* __u64 */ XFRMA_POLICY_TYPE, /* struct xfrm_userpolicy_type */ XFRMA_MIGRATE, XFRMA_ALG_AEAD, /* struct xfrm_algo_aead */ -- cgit v1.2.3 From 23339e5752d01a4b5e122759b002cf896d26f6c1 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 1 Aug 2022 10:08:08 -0700 Subject: f2fs: revive F2FS_IOC_ABORT_VOLATILE_WRITE F2FS_IOC_ABORT_VOLATILE_WRITE was previously used to abort an atomic write. However, it was removed accidentally. So revive it under a new name, since volatile writes are gone. Signed-off-by: Daeho Jeong Fixes: 7bc155fec5b3 ("f2fs: kill volatile write support") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 30 ++++++++++++++++++++++++++++-- include/uapi/linux/f2fs.h | 2 +- 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8098ed890e94..29711e5b6983 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2109,6 +2109,31 @@ unlock_out: return ret; } +static int f2fs_ioc_abort_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + int ret; + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) + f2fs_abort_atomic_write(inode, true); + + inode_unlock(inode); + + mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} + static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4054,9 +4079,10 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_start_atomic_write(filp); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_ABORT_ATOMIC_WRITE: + return f2fs_ioc_abort_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: - case F2FS_IOC_ABORT_VOLATILE_WRITE: return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); @@ -4725,7 +4751,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: - case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_ABORT_ATOMIC_WRITE: case F2FS_IOC_SHUTDOWN: case FITRIM: case FS_IOC_SET_ENCRYPTION_POLICY: diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 352a822d4370..3121d127d5aa 100644 --- a/include/uapi/linux/f2fs.h
+++ b/include/uapi/linux/f2fs.h @@ -13,7 +13,7 @@ #define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) -#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_ABORT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ -- cgit v1.2.3 From ca34ce29fc4b0e929cc6aada40829d17ab50fee4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 8 Aug 2022 09:47:23 -0700 Subject: bpf: Improve docstring for BPF_F_USER_BUILD_ID flag Most tools which use bpf_get_stack or bpf_get_stackid symbolicate the stack - meaning the stack of addresses in the target process' address space is transformed into meaningful symbol names. The BPF_F_USER_BUILD_ID flag eases this process by finding the build_id of the file-backed vma which the address falls in and translating the address to an offset within the backing file. To be more specific, the offset is a "file offset" from the beginning of the backing file. The symbols in ET_DYN ELF objects have a st_value which is also described as an "offset" - but an offset in the process address space, relative to the base address of the object. It's necessary to translate between the "file offset" and "virtual address offset" during symbolication before they can be directly compared. Failure to do so can lead to confusing bugs, so this patch clarifies language in the documentation in an attempt to keep this from happening. Signed-off-by: Dave Marchevsky Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220808164723.3107500-1-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 14 ++++++++++++-- tools/include/uapi/linux/bpf.h | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bf9ba1329be..534e33fb1029 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3008,8 +3008,18 @@ union bpf_attr { * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..f58d58e1d547 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3008,8 +3008,18 @@ union bpf_attr { * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. 
* **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject -- cgit v1.2.3 From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping. Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 14 ++++++++++++++ tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 5 files changed, 42 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 534e33fb1029..7d1e2794d83e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. 
+ * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..639437f36928 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2623,6 +2623,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1f961f9982d2..a95eb9fb01ff 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -198,6 +198,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_tai_ns) +{ + /* NMI safe access to clock tai */ + return ktime_get_tai_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { + .func = bpf_ktime_get_tai_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -1617,6 +1629,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_tai_ns: + return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f58d58e1d547..e174ad28aeb7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 134941683b89d05b5e5c28c817c95049ba409d01 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 6 Aug 2022 17:39:20 +0200 Subject: netfilter: ip6t_LOG: Fix a typo in a comment s/_IPT_LOG_H/_IP6T_LOG_H/ While at it add some surrounding space to ease reading. 
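For readers skimming the hunk below, the convention being restored is simply that the closing #endif comment names the guard macro the header actually defines. A minimal sketch of the pattern (hypothetical header contents, not from the patch):

#ifndef _IP6T_LOG_H
#define _IP6T_LOG_H

/* header declarations go here */

#endif /* _IP6T_LOG_H */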
Signed-off-by: Christophe JAILLET Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_LOG.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h index 23e91a9c2583..0b7b16dbdec2 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_LOG.h @@ -17,4 +17,4 @@ struct ip6t_log_info { char prefix[30]; }; -#endif /*_IPT_LOG_H*/ +#endif /* _IP6T_LOG_H */ -- cgit v1.2.3 From f329a0ebeaba4ffe91d431e0ac1ca7f9165872a4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Aug 2022 16:27:40 -0700 Subject: genetlink: correct uAPI defines Commit 50a896cf2d6f ("genetlink: properly support per-op policy dumping") seems to have copy'n'pasted things a little incorrectly. The #define CTRL_ATTR_MCAST_GRP_MAX should have stayed right after the previous enum. The new CTRL_ATTR_POLICY_* needs its own define for MAX and that max should not contain the superfluous _DUMP in the name. We probably can't do anything about the CTRL_ATTR_POLICY_DUMP_MAX any more; there's likely code which uses it. For consistency (*cough* codegen *cough*) let's add the correctly named define nonetheless. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index d83f214b4134..ddba3ca01e39 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -87,6 +87,8 @@ enum { __CTRL_ATTR_MCAST_GRP_MAX, }; +#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) + enum { CTRL_ATTR_POLICY_UNSPEC, CTRL_ATTR_POLICY_DO, @@ -96,7 +98,6 @@ enum { CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 }; -#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) - +#define CTRL_ATTR_POLICY_MAX (__CTRL_ATTR_POLICY_DUMP_MAX - 1) #endif /* _UAPI__LINUX_GENERIC_NETLINK_H */ -- cgit v1.2.3 From ea024594b1dc5b6719c1400ae154690f5c203996 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Mon, 1 Aug 2022 14:38:45 +0800 Subject: virtio_pci: struct virtio_pci_common_cfg add queue_notify_data Add queue_notify_data in struct virtio_pci_common_cfg, which comes from here https://github.com/oasis-tcs/virtio-spec/issues/89 In order not to affect the API, add a dedicated structure struct virtio_pci_modern_common_cfg to virtio_pci_modern.h. Since I want to add queue_reset after queue_notify_data, I submitted this patch first. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Message-Id: <20220801063902.129329-26-xuanzhuo@linux.alibaba.com> Signed-off-by: Michael S.
Tsirkin --- include/linux/virtio_pci_modern.h | 7 +++++++ include/uapi/linux/virtio_pci.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index eb2bd9b4077d..41f5a018bd94 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -5,6 +5,13 @@ #include #include +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + __le16 queue_notify_data; /* read-write */ + __le16 padding; +}; + struct virtio_pci_modern_device { struct pci_dev *pci_dev; diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 3a86f36d7e3d..f5981a874481 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -202,6 +202,7 @@ struct virtio_pci_cfg_cap { #define VIRTIO_PCI_COMMON_Q_AVAILHI 44 #define VIRTIO_PCI_COMMON_Q_USEDLO 48 #define VIRTIO_PCI_COMMON_Q_USEDHI 52 +#define VIRTIO_PCI_COMMON_Q_NDATA 56 #endif /* VIRTIO_PCI_NO_MODERN */ -- cgit v1.2.3 From d94587b5bb5c4bba32fbc2bd92c86cc6de25167f Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Mon, 1 Aug 2022 14:38:47 +0800 Subject: virtio: queue_reset: add VIRTIO_F_RING_RESET Added VIRTIO_F_RING_RESET, it came from here https://github.com/oasis-tcs/virtio-spec/issues/124 https://github.com/oasis-tcs/virtio-spec/issues/139 This feature indicates that the driver can reset a queue individually. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Message-Id: <20220801063902.129329-28-xuanzhuo@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_config.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index f0fb0ae021c0..3c05162bc988 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -52,7 +52,7 @@ * rest are per-device feature bits. */ #define VIRTIO_TRANSPORT_F_START 28 -#define VIRTIO_TRANSPORT_F_END 38 +#define VIRTIO_TRANSPORT_F_END 41 #ifndef VIRTIO_CONFIG_NO_LEGACY /* Do we get callbacks when the ring is completely used, even if we've @@ -98,4 +98,9 @@ * Does the device support Single Root I/O Virtualization? */ #define VIRTIO_F_SR_IOV 37 + +/* + * This feature indicates that the driver can reset a queue individually. + */ +#define VIRTIO_F_RING_RESET 40 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3 From 0cdd450e70510c9e13af8099e9f6c1467e6a0b91 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Mon, 1 Aug 2022 14:38:49 +0800 Subject: virtio_pci: struct virtio_pci_common_cfg add queue_reset Add queue_reset in virtio_pci_modern_common_cfg. https://github.com/oasis-tcs/virtio-spec/issues/124 https://github.com/oasis-tcs/virtio-spec/issues/139 Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Message-Id: <20220801063902.129329-30-xuanzhuo@linux.alibaba.com> Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio_pci_modern.h | 2 +- include/uapi/linux/virtio_pci.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index 41f5a018bd94..05123b9a606f 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -9,7 +9,7 @@ struct virtio_pci_modern_common_cfg { struct virtio_pci_common_cfg cfg; __le16 queue_notify_data; /* read-write */ - __le16 padding; + __le16 queue_reset; /* read-write */ }; struct virtio_pci_modern_device { diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index f5981a874481..f703afc7ad31 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -203,6 +203,7 @@ struct virtio_pci_cfg_cap { #define VIRTIO_PCI_COMMON_Q_USEDLO 48 #define VIRTIO_PCI_COMMON_Q_USEDHI 52 #define VIRTIO_PCI_COMMON_Q_NDATA 56 +#define VIRTIO_PCI_COMMON_Q_RESET 58 #endif /* VIRTIO_PCI_NO_MODERN */ -- cgit v1.2.3 From 699b045a8e43bd1063db4795be685bfd659649dc Mon Sep 17 00:00:00 2001 From: Alvaro Karsz Date: Mon, 18 Jul 2022 12:11:02 +0300 Subject: net: virtio_net: notifications coalescing support New VirtIO network feature: VIRTIO_NET_F_NOTF_COAL. Control a VirtIO network device's notification coalescing parameters using the control virtqueue. A device that supports this feature can receive VIRTIO_NET_CTRL_NOTF_COAL control commands. - VIRTIO_NET_CTRL_NOTF_COAL_TX_SET: Ask the network device to change the following parameters: - tx_usecs: Maximum number of usecs to delay a TX notification. - tx_max_packets: Maximum number of packets to send before a TX notification. - VIRTIO_NET_CTRL_NOTF_COAL_RX_SET: Ask the network device to change the following parameters: - rx_usecs: Maximum number of usecs to delay a RX notification. - rx_max_packets: Maximum number of packets to receive before a RX notification. VirtIO spec. patch: https://lists.oasis-open.org/archives/virtio-comment/202206/msg00100.html Signed-off-by: Alvaro Karsz Message-Id: <20220718091102.498774-1-alvaro.karsz@solid-run.com> Signed-off-by: Michael S.
Tsirkin Reviewed-by: Jakub Kicinski Acked-by: Jason Wang --- drivers/net/virtio_net.c | 111 ++++++++++++++++++++++++++++++++++------ include/uapi/linux/virtio_net.h | 34 +++++++++++- 2 files changed, 129 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 16783ed782c5..d9c434b00e9b 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -270,6 +270,12 @@ struct virtnet_info { u8 duplex; u32 speed; + /* Interrupt coalescing settings */ + u32 tx_usecs; + u32 rx_usecs; + u32 tx_max_packets; + u32 rx_max_packets; + unsigned long guest_offloads; unsigned long guest_offloads_capable; @@ -2737,27 +2743,89 @@ static int virtnet_get_link_ksettings(struct net_device *dev, return 0; } +static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi, + struct ethtool_coalesce *ec) +{ + struct scatterlist sgs_tx, sgs_rx; + struct virtio_net_ctrl_coal_tx coal_tx; + struct virtio_net_ctrl_coal_rx coal_rx; + + coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); + coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); + sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, + VIRTIO_NET_CTRL_NOTF_COAL_TX_SET, + &sgs_tx)) + return -EINVAL; + + /* Save parameters */ + vi->tx_usecs = ec->tx_coalesce_usecs; + vi->tx_max_packets = ec->tx_max_coalesced_frames; + + coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); + coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); + sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx)); + + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, + VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, + &sgs_rx)) + return -EINVAL; + + /* Save parameters */ + vi->rx_usecs = ec->rx_coalesce_usecs; + vi->rx_max_packets = ec->rx_max_coalesced_frames; + + return 0; +} + +static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) +{ + /* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL + * feature is negotiated. + */ + if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs) + return -EOPNOTSUPP; + + if (ec->tx_max_coalesced_frames > 1 || + ec->rx_max_coalesced_frames != 1) + return -EINVAL; + + return 0; +} + static int virtnet_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); - int i, napi_weight; - - if (ec->tx_max_coalesced_frames > 1 || - ec->rx_max_coalesced_frames != 1) - return -EINVAL; + int ret, i, napi_weight; + bool update_napi = false; + /* Can't change NAPI weight if the link is up */ napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; if (napi_weight ^ vi->sq[0].napi.weight) { if (dev->flags & IFF_UP) return -EBUSY; + else + update_napi = true; + } + + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) + ret = virtnet_send_notf_coal_cmds(vi, ec); + else + ret = virtnet_coal_params_supported(ec); + + if (ret) + return ret; + + if (update_napi) { for (i = 0; i < vi->max_queue_pairs; i++) vi->sq[i].napi.weight = napi_weight; } - return 0; + return ret; } static int virtnet_get_coalesce(struct net_device *dev, @@ -2765,16 +2833,19 @@ static int virtnet_get_coalesce(struct net_device *dev, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { - struct ethtool_coalesce ec_default = { - .cmd = ETHTOOL_GCOALESCE, - .rx_max_coalesced_frames = 1, - }; struct virtnet_info *vi = netdev_priv(dev); - memcpy(ec, &ec_default, sizeof(ec_default)); + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { + ec->rx_coalesce_usecs = vi->rx_usecs; + ec->tx_coalesce_usecs = vi->tx_usecs; + ec->tx_max_coalesced_frames = vi->tx_max_packets; + ec->rx_max_coalesced_frames = vi->rx_max_packets; + } else { + ec->rx_max_coalesced_frames = 1; - if (vi->sq[0].napi.weight) - ec->tx_max_coalesced_frames = 1; + if (vi->sq[0].napi.weight) + ec->tx_max_coalesced_frames = 1; + } return 0; } @@ -2893,7 +2964,8 @@ static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info) } static const struct ethtool_ops virtnet_ethtool_ops = { - .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES, + .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USECS, .get_drvinfo = virtnet_get_drvinfo, .get_link = ethtool_op_get_link, .get_ringparam = virtnet_get_ringparam, @@ -3606,6 +3678,8 @@ static bool virtnet_validate_features(struct virtio_device *vdev) VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT, + "VIRTIO_NET_F_CTRL_VQ") || + VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL, "VIRTIO_NET_F_CTRL_VQ"))) { return false; } @@ -3742,6 +3816,13 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { + vi->rx_usecs = 0; + vi->tx_usecs = 0; + vi->tx_max_packets = 0; + vi->rx_max_packets = 0; + } + if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) vi->has_rss_hash_report = true; @@ -3977,7 +4058,7 @@ static struct virtio_device_id id_table[] = { VIRTIO_NET_F_CTRL_MAC_ADDR, \ VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \ - VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT + VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL static unsigned int features[] = { VIRTNET_FEATURES, diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 3f55a4215f11..29ced55514d4 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -56,7 +56,7 @@ #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ - +#define VIRTIO_NET_F_NOTF_COAL 53 /* Guest can handle notifications coalescing */ #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ @@ -355,4 +355,36 @@ struct virtio_net_hash_config { #define VIRTIO_NET_CTRL_GUEST_OFFLOADS 5 #define 
VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET 0 +/* + * Control notifications coalescing. + * + * Request the device to change the notifications coalescing parameters. + * + * Available with the VIRTIO_NET_F_NOTF_COAL feature bit. + */ +#define VIRTIO_NET_CTRL_NOTF_COAL 6 +/* + * Set the tx-usecs/tx-max-packets parameters. + * tx-usecs - Maximum number of usecs to delay a TX notification. + * tx-max-packets - Maximum number of packets to send before a TX notification. + */ +struct virtio_net_ctrl_coal_tx { + __le32 tx_max_packets; + __le32 tx_usecs; +}; + +#define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET 0 + +/* + * Set the rx-usecs/rx-max-packets parameters. + * rx-usecs - Maximum number of usecs to delay a RX notification. + * rx-max-packets - Maximum number of packets to receive before a RX notification. + */ +struct virtio_net_ctrl_coal_rx { + __le32 rx_max_packets; + __le32 rx_usecs; +}; + +#define VIRTIO_NET_CTRL_NOTF_COAL_RX_SET 1 + #endif /* _UAPI_LINUX_VIRTIO_NET_H */ -- cgit v1.2.3 From 79a463be9e0051997508d52cf411ed5e91d657f6 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 3 Aug 2022 12:55:22 +0800 Subject: vduse: Support registering userspace memory for IOVA regions Introduce two ioctls: VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM to support registering and de-registering userspace memory for IOVA regions. Now it only supports registering userspace memory for the bounce buffer region in the virtio-vdpa case. Signed-off-by: Xie Yongji Acked-by: Jason Wang Message-Id: <20220803045523.23851-5-xieyongji@bytedance.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_user/vduse_dev.c | 141 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/vduse.h | 23 ++++++ 2 files changed, 164 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 3bc27de58f46..eedff0a3885a 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -64,6 +66,13 @@ struct vduse_vdpa { struct vduse_dev *dev; }; +struct vduse_umem { + unsigned long iova; + unsigned long npages; + struct page **pages; + struct mm_struct *mm; +}; + struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; @@ -95,6 +104,8 @@ struct vduse_dev { u8 status; u32 vq_num; u32 vq_align; + struct vduse_umem *umem; + struct mutex mem_lock; }; struct vduse_dev_msg { @@ -917,6 +928,102 @@ unlock: return ret; } +static int vduse_dev_dereg_umem(struct vduse_dev *dev, + u64 iova, u64 size) +{ + int ret; + + mutex_lock(&dev->mem_lock); + ret = -ENOENT; + if (!dev->umem) + goto unlock; + + ret = -EINVAL; + if (dev->umem->iova != iova || size != dev->domain->bounce_size) + goto unlock; + + vduse_domain_remove_user_bounce_pages(dev->domain); + unpin_user_pages_dirty_lock(dev->umem->pages, + dev->umem->npages, true); + atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); + mmdrop(dev->umem->mm); + vfree(dev->umem->pages); + kfree(dev->umem); + dev->umem = NULL; + ret = 0; +unlock: + mutex_unlock(&dev->mem_lock); + return ret; +} + +static int vduse_dev_reg_umem(struct vduse_dev *dev, + u64 iova, u64 uaddr, u64 size) +{ + struct page **page_list = NULL; + struct vduse_umem *umem = NULL; + long pinned = 0; + unsigned long npages, lock_limit; + int ret; + + if (!dev->domain->bounce_map || + size != dev->domain->bounce_size || + iova != 0 || uaddr & ~PAGE_MASK) + return -EINVAL; + + mutex_lock(&dev->mem_lock); + ret = -EEXIST; + if (dev->umem) +
goto unlock; + + ret = -ENOMEM; + npages = size >> PAGE_SHIFT; + page_list = __vmalloc(array_size(npages, sizeof(struct page *)), + GFP_KERNEL_ACCOUNT); + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!page_list || !umem) + goto unlock; + + mmap_read_lock(current->mm); + + lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK)); + if (npages + atomic64_read(¤t->mm->pinned_vm) > lock_limit) + goto out; + + pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, + page_list, NULL); + if (pinned != npages) { + ret = pinned < 0 ? pinned : -ENOMEM; + goto out; + } + + ret = vduse_domain_add_user_bounce_pages(dev->domain, + page_list, pinned); + if (ret) + goto out; + + atomic64_add(npages, ¤t->mm->pinned_vm); + + umem->pages = page_list; + umem->npages = pinned; + umem->iova = iova; + umem->mm = current->mm; + mmgrab(current->mm); + + dev->umem = umem; +out: + if (ret && pinned > 0) + unpin_user_pages(page_list, pinned); + + mmap_read_unlock(current->mm); +unlock: + if (ret) { + vfree(page_list); + kfree(umem); + } + mutex_unlock(&dev->mem_lock); + return ret; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1089,6 +1196,38 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); break; } + case VDUSE_IOTLB_REG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + ret = vduse_dev_reg_umem(dev, umem.iova, + umem.uaddr, umem.size); + break; + } + case VDUSE_IOTLB_DEREG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + ret = vduse_dev_dereg_umem(dev, umem.iova, + umem.size); + break; + } default: ret = -ENOIOCTLCMD; break; @@ -1101,6 +1240,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file) { struct vduse_dev *dev = file->private_data; + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ list_splice_init(&dev->recv_list, &dev->send_list); @@ -1163,6 +1303,7 @@ static struct vduse_dev *vduse_dev_create(void) return NULL; mutex_init(&dev->lock); + mutex_init(&dev->mem_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); INIT_LIST_HEAD(&dev->recv_list); diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 7cfe1c1280c0..9885e0571f09 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -210,6 +210,29 @@ struct vduse_vq_eventfd { */ #define VDUSE_VQ_INJECT_IRQ _IOW(VDUSE_BASE, 0x17, __u32) +/** + * struct vduse_iova_umem - userspace memory configuration for one IOVA region + * @uaddr: start address of userspace memory, it must be aligned to page size + * @iova: start of the IOVA region + * @size: size of the IOVA region + * @reserved: for future use, needs to be initialized to zero + * + * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM + * ioctls to register/de-register userspace memory for IOVA regions + */ +struct vduse_iova_umem { + __u64 uaddr; + __u64 iova; + __u64 size; + __u64 reserved[3]; +}; + +/* Register userspace memory for IOVA regions */ +#define VDUSE_IOTLB_REG_UMEM _IOW(VDUSE_BASE, 0x18, struct vduse_iova_umem) + +/* De-register 
the userspace memory. Caller should set iova and size field. */ +#define VDUSE_IOTLB_DEREG_UMEM _IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem) + /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ /** -- cgit v1.2.3 From ad146355bfad627bd0717ece73997c6c93b1b82e Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 3 Aug 2022 12:55:23 +0800 Subject: vduse: Support querying information of IOVA regions This introduces a new ioctl: VDUSE_IOTLB_GET_INFO to support querying some information of IOVA regions. Now it can be used to query whether the IOVA region supports userspace memory registration. Signed-off-by: Xie Yongji Message-Id: <20220803045523.23851-6-xieyongji@bytedance.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_user/vduse_dev.c | 39 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vduse.h | 24 +++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index eedff0a3885a..41c0b29739f1 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1228,6 +1228,45 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, umem.size); break; } + case VDUSE_IOTLB_GET_INFO: { + struct vduse_iova_info info; + struct vhost_iotlb_map *map; + struct vduse_iova_domain *domain = dev->domain; + + ret = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info))) + break; + + ret = -EINVAL; + if (info.start > info.last) + break; + + if (!is_mem_zero((const char *)info.reserved, + sizeof(info.reserved))) + break; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, + info.start, info.last); + if (map) { + info.start = map->start; + info.last = map->last; + info.capability = 0; + if (domain->bounce_map && map->start == 0 && + map->last == domain->bounce_size - 1) + info.capability |= VDUSE_IOVA_CAP_UMEM; + } + spin_unlock(&domain->iotlb_lock); + if (!map) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &info, sizeof(info))) + break; + + ret = 0; + break; + } default: ret = -ENOIOCTLCMD; break; diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 9885e0571f09..11bd48c72c6c 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -233,6 +233,30 @@ struct vduse_iova_umem { /* De-register the userspace memory. Caller should set iova and size field. */ #define VDUSE_IOTLB_DEREG_UMEM _IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem) +/** + * struct vduse_iova_info - information of one IOVA region + * @start: start of the IOVA region + * @last: last of the IOVA region + * @capability: capability of the IOVA regsion + * @reserved: for future use, needs to be initialized to zero + * + * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of + * one IOVA region. + */ +struct vduse_iova_info { + __u64 start; + __u64 last; +#define VDUSE_IOVA_CAP_UMEM (1 << 0) + __u64 capability; + __u64 reserved[3]; +}; + +/* + * Find the first IOVA region that overlaps with the range [start, last] + * and return some information on it. Caller should set start and last fields. 
+ */ +#define VDUSE_IOTLB_GET_INFO _IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info) + /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ /** -- cgit v1.2.3 From 0723f1df5c3ec8a1112d150dab98e149361ef488 Mon Sep 17 00:00:00 2001 From: Eugenio Pérez Date: Wed, 10 Aug 2022 19:15:10 +0200 Subject: vhost-vdpa: introduce SUSPEND backend feature bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userland knows if it can suspend the device or not by checking this feature bit. It's only offered if the vdpa driver backend implements the suspend() operation callback; offering it, or letting userland ack it, when the backend does not implement that callback is an error. Signed-off-by: Eugenio Pérez Message-Id: <20220810171512.2343333-3-eperezma@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 16 +++++++++++++++- include/uapi/linux/vhost_types.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 2c997d77d266..092752fea8e1 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -347,6 +347,14 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, return 0; } +static bool vhost_vdpa_can_suspend(const struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + return ops->suspend; +} + static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; @@ -577,7 +585,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, if (cmd == VHOST_SET_BACKEND_FEATURES) { if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; - if (features & ~VHOST_VDPA_BACKEND_FEATURES) + if (features & ~(VHOST_VDPA_BACKEND_FEATURES | + BIT_ULL(VHOST_BACKEND_F_SUSPEND))) + return -EOPNOTSUPP; + if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) && + !vhost_vdpa_can_suspend(v)) return -EOPNOTSUPP; vhost_set_backend_features(&v->vdev, features); return 0; @@ -628,6 +640,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, break; case VHOST_GET_BACKEND_FEATURES: features = VHOST_VDPA_BACKEND_FEATURES; + if (vhost_vdpa_can_suspend(v)) + features |= BIT_ULL(VHOST_BACKEND_F_SUSPEND); if (copy_to_user(featurep, &features, sizeof(features))) r = -EFAULT; break; diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 634cee485abb..1bdd6e363f4c 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -161,5 +161,7 @@ struct vhost_vdpa_iova_range { * message */ #define VHOST_BACKEND_F_IOTLB_ASID 0x3 +/* Device can be suspended */ +#define VHOST_BACKEND_F_SUSPEND 0x4 #endif -- cgit v1.2.3 From f345a0143b4dd1cfc850009c6979a3801b86a06f Mon Sep 17 00:00:00 2001 From: Eugenio Pérez Date: Wed, 10 Aug 2022 19:15:11 +0200 Subject: vhost-vdpa: uAPI to suspend the device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ioctl adds support for suspending the device from userspace. This is a must before getting virtqueue indexes (base) for live migration, since the device could modify them after userland gets them. There are individual ways to perform that action for some devices (VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING, ...) but there was no way to perform it for any vhost device (and, in particular, vhost-vdpa). A hedged userspace sketch of the new flow is shown below.
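A minimal sketch, assuming an already-open and configured vhost-vdpa file descriptor: the feature bit and both ioctls come from these two patches, while the helper name and the reduced error handling are illustrative only.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Suspend a vhost-vdpa device, but only if the backend advertises the
 * VHOST_BACKEND_F_SUSPEND capability; issuing VHOST_VDPA_SUSPEND against
 * a backend without a suspend() callback returns -EOPNOTSUPP.
 */
static int suspend_if_supported(int fd)
{
	uint64_t features;

	if (ioctl(fd, VHOST_GET_BACKEND_FEATURES, &features))
		return -1;
	if (!(features & (1ULL << VHOST_BACKEND_F_SUSPEND)))
		return -1;	/* backend cannot suspend */
	return ioctl(fd, VHOST_VDPA_SUSPEND);
}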
After a successful return of the ioctl call the device must not process more virtqueue descriptors. The device can answer to read or writes of config fields as if it were not suspended. In particular, writing to "queue_enable" with a value of 1 will not make the device start processing buffers of the virtqueue. Signed-off-by: Eugenio Pérez Message-Id: <20220810171512.2343333-4-eperezma@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 19 +++++++++++++++++++ include/uapi/linux/vhost.h | 9 +++++++++ 2 files changed, 28 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 092752fea8e1..166044642fd5 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -478,6 +478,22 @@ static long vhost_vdpa_get_vqs_count(struct vhost_vdpa *v, u32 __user *argp) return 0; } +/* After a successful return of ioctl the device must not process more + * virtqueue descriptors. The device can answer to read or writes of config + * fields as if it were not suspended. In particular, writing to "queue_enable" + * with a value of 1 will not make the device start processing buffers. + */ +static long vhost_vdpa_suspend(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (!ops->suspend) + return -EOPNOTSUPP; + + return ops->suspend(vdpa); +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -654,6 +670,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_VDPA_GET_VQS_COUNT: r = vhost_vdpa_get_vqs_count(v, argp); break; + case VHOST_VDPA_SUSPEND: + r = vhost_vdpa_suspend(v); + break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index cab645d4a645..f9f115a7c75b 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -171,4 +171,13 @@ #define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ struct vhost_vring_state) +/* Suspend a device so it does not process virtqueue requests anymore + * + * After the return of ioctl the device must preserve all the necessary state + * (the virtqueue vring base plus the possible device specific states) that is + * required for restoring in the future. The device must not change its + * configuration after that point. + */ +#define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) + #endif -- cgit v1.2.3 From c2e75634cbe368065f140dd30bf8b1a0355158fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Aug 2022 09:45:47 -0700 Subject: net: atm: bring back zatm uAPI Jiri reports that linux-atm does not build without this header. Bring it back. 
It's completely dead code but we can't break the build for user space :( Reported-by: Jiri Slaby Fixes: 052e1f01bfae ("net: atm: remove support for ZeitNet ZN122x ATM devices") Link: https://lore.kernel.org/all/8576aef3-37e4-8bae-bab5-08f82a78efd3@kernel.org/ Link: https://lore.kernel.org/r/20220810164547.484378-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/atm_zatm.h | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 include/uapi/linux/atm_zatm.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/atm_zatm.h b/include/uapi/linux/atm_zatm.h new file mode 100644 index 000000000000..5135027b93c1 --- /dev/null +++ b/include/uapi/linux/atm_zatm.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* atm_zatm.h - Driver-specific declarations of the ZATM driver (for use by + driver-specific utilities) */ + +/* Written 1995-1999 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef LINUX_ATM_ZATM_H +#define LINUX_ATM_ZATM_H + +/* + * Note: non-kernel programs including this file must also include + * sys/types.h for struct timeval + */ + +#include +#include + +#define ZATM_GETPOOL _IOW('a',ATMIOC_SARPRV+1,struct atmif_sioc) + /* get pool statistics */ +#define ZATM_GETPOOLZ _IOW('a',ATMIOC_SARPRV+2,struct atmif_sioc) + /* get statistics and zero */ +#define ZATM_SETPOOL _IOW('a',ATMIOC_SARPRV+3,struct atmif_sioc) + /* set pool parameters */ + +struct zatm_pool_info { + int ref_count; /* free buffer pool usage counters */ + int low_water,high_water; /* refill parameters */ + int rqa_count,rqu_count; /* queue condition counters */ + int offset,next_off; /* alignment optimizations: offset */ + int next_cnt,next_thres; /* repetition counter and threshold */ +}; + +struct zatm_pool_req { + int pool_num; /* pool number */ + struct zatm_pool_info info; /* actual information */ +}; + +#define ZATM_OAM_POOL 0 /* free buffer pool for OAM cells */ +#define ZATM_AAL0_POOL 1 /* free buffer pool for AAL0 cells */ +#define ZATM_AAL5_POOL_BASE 2 /* first AAL5 free buffer pool */ +#define ZATM_LAST_POOL ZATM_AAL5_POOL_BASE+10 /* max. 64 kB */ + +#define ZATM_TIMER_HISTORY_SIZE 16 /* number of timer adjustments to + record; must be 2^n */ + +#endif -- cgit v1.2.3 From 4961d0772578e8737afe61370743f3bc22867111 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 12 Aug 2022 16:37:27 +0100 Subject: bpf: Clear up confusion in bpf_skb_adjust_room()'s documentation Adding or removing room space _below_ layers 2 or 3, as the description mentions, is ambiguous. This was written with a mental image of the packet with layer 2 at the top, layer 3 under it, and so on. But it has led users to believe that it was on lower layers (before the beginning of the L2 and L3 headers respectively). Let's make it more explicit, and specify between which layers the room space is adjusted. 
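To make the clarified semantics concrete, here is a hedged sketch of a tc BPF program that opens four bytes of room between the layer 2 and layer 3 headers; the section name, program name, and shim size are illustrative, while the helper call and the BPF_ADJ_ROOM_MAC mode are exactly what the updated text describes.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int reserve_shim(struct __sk_buff *skb)
{
	/* BPF_ADJ_ROOM_MAC: the four new bytes appear between the
	 * Ethernet header and the IP header, not below the packet start.
	 */
	if (bpf_skb_adjust_room(skb, 4, BPF_ADJ_ROOM_MAC, 0))
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";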
Reported-by: Rumen Telbizov Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220812153727.224500-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 6 ++++-- tools/include/uapi/linux/bpf.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d1e2794d83e..934a2a8beb87 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2573,10 +2573,12 @@ union bpf_attr { * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e174ad28aeb7..1d6085e15fc8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2573,10 +2573,12 @@ union bpf_attr { * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * -- cgit v1.2.3 From 5c669c4a4c6aa0489848093c93b8029f5c5c75ec Mon Sep 17 00:00:00 2001 From: Ricardo Cañuelo Date: Wed, 10 Aug 2022 11:40:03 +0200 Subject: virtio: kerneldocs fixes and enhancements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix variable names in some kerneldocs, naming in others. Add kerneldocs for struct vring_desc and vring_interrupt. Signed-off-by: Ricardo Cañuelo Message-Id: <20220810094004.1250-2-ricardo.canuelo@collabora.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- drivers/virtio/virtio_ring.c | 8 ++++++++ include/linux/virtio.h | 6 +++--- include/linux/virtio_config.h | 6 +++--- include/uapi/linux/virtio_ring.h | 16 +++++++++++----- 4 files changed, 25 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index d66c8e6d0ef3..4620e9d79dde 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2426,6 +2426,14 @@ static inline bool more_used(const struct vring_virtqueue *vq) return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq); } +/** + * vring_interrupt - notify a virtqueue on an interrupt + * @irq: the IRQ number (ignored) + * @_vq: the struct virtqueue to notify + * + * Calls the callback function of @_vq to process the virtqueue + * notification. + */ irqreturn_t vring_interrupt(int irq, void *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index a3f73bb6733e..dcab9c7e8784 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -11,7 +11,7 @@ #include /** - * virtqueue - a queue to register buffers for sending or receiving. 
+ * struct virtqueue - a queue to register buffers for sending or receiving. * @list: the chain of virtqueues for this device * @callback: the function to call when buffers are consumed (can be NULL). * @name: the name of this virtqueue (mainly for debugging) @@ -97,7 +97,7 @@ int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf)); /** - * virtio_device - representation of a device using virtio + * struct virtio_device - representation of a device using virtio * @index: unique position on the virtio bus * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore) * @config_enabled: configuration change reporting enabled @@ -156,7 +156,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); list_for_each_entry(vq, &vdev->vqs, list) /** - * virtio_driver - operations for a virtio I/O driver + * struct virtio_driver - operations for a virtio I/O driver * @driver: underlying device driver (populate name and owner). * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this driver. diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 36ec7be1f480..4b517649cfe8 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -239,7 +239,7 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, /** * virtio_synchronize_cbs - synchronize with virtqueue callbacks - * @vdev: the device + * @dev: the virtio device */ static inline void virtio_synchronize_cbs(struct virtio_device *dev) @@ -258,7 +258,7 @@ void virtio_synchronize_cbs(struct virtio_device *dev) /** * virtio_device_ready - enable vq use in probe function - * @vdev: the device + * @dev: the virtio device * * Driver must call this to use vqs in the probe function. * @@ -306,7 +306,7 @@ const char *virtio_bus_name(struct virtio_device *vdev) /** * virtqueue_set_affinity - setting affinity for a virtqueue * @vq: the virtqueue - * @cpu: the cpu no. + * @cpu_mask: the cpu mask * * Pay attention the function are best-effort: the affinity hint may not be set * due to config support, irq type and sharing. diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 476d3e5c0fe7..f8c20d3de8da 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -93,15 +93,21 @@ #define VRING_USED_ALIGN_SIZE 4 #define VRING_DESC_ALIGN_SIZE 16 -/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ +/** + * struct vring_desc - Virtio ring descriptors, + * 16 bytes long. These can chain together via @next. + * + * @addr: buffer address (guest-physical) + * @len: buffer length + * @flags: descriptor flags + * @next: index of the next descriptor in the chain, + * if the VRING_DESC_F_NEXT flag is set. We chain unused + * descriptors via this, too. + */ struct vring_desc { - /* Address (guest-physical). */ __virtio64 addr; - /* Length. */ __virtio32 len; - /* The flags as indicated above. */ __virtio16 flags; - /* We chain unused descriptors via this, too */ __virtio16 next; }; -- cgit v1.2.3 From 3024d95a4c521c278a7504ee9e80c57c3a9750e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 17 Aug 2022 23:32:09 +0200 Subject: bpf: Partially revert flexible-array member replacement Partially revert 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") given it breaks BPF UAPI. For example, BPF CI run reveals build breakage under LLVM: [...] 
CLNG-BPF [test_maps] map_ptr_kern.o CLNG-BPF [test_maps] btf__core_reloc_arrays___diff_arr_val_sz.o CLNG-BPF [test_maps] test_bpf_cookie.o progs/map_ptr_kern.c:314:26: error: field 'trie_key' with variable sized type 'struct bpf_lpm_trie_key' not at the end of a struct or class is a GNU extension [-Werror,-Wgnu-variable-sized-type-not-at-end] struct bpf_lpm_trie_key trie_key; ^ CLNG-BPF [test_maps] btf__core_reloc_type_based___diff.o 1 error generated. make: *** [Makefile:521: /tmp/runner/work/bpf/bpf/tools/testing/selftests/bpf/map_ptr_kern.o] Error 1 make: *** Waiting for unfinished jobs.... [...] Typical usage of the bpf_lpm_trie_key is that the struct gets embedded into a user defined key for the LPM BPF map, from the selftest example: struct bpf_lpm_trie_key { <-- UAPI exported struct __u32 prefixlen; __u8 data[]; }; struct lpm_key { <-- BPF program defined struct struct bpf_lpm_trie_key trie_key; __u32 data; }; Undo this for BPF until a different solution can be found. It's the only flexible- array member case in the UAPI header. This was discovered in BPF CI after Dave reported that the include/uapi/linux/bpf.h header was out of sync with tools/include/uapi/linux/bpf.h after 94dfc73e7cf4. And the subsequent sync attempt failed CI. Fixes: 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") Reported-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Cc: Gustavo A. R. Silva Link: https://lore.kernel.org/bpf/22aebc88-da67-f086-e620-dd4a16e2bc69@iogearbox.net --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bf9ba1329be..59a217ca2dfd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -79,7 +79,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { -- cgit v1.2.3 From 5d81757835859760a38a54a87eff856d4e578836 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Thu, 18 Aug 2022 18:32:30 +0300 Subject: net: macsec: Expose MACSEC_SALT_LEN definition to user space Expose MACSEC_SALT_LEN definition to user space to be used in various user space applications such as iproute. Iproute will use this as part of adding macsec extended packet number support. 
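A small, hedged illustration of the intended consumer: the struct below is invented for the example, and only the two length macros are uAPI.

#include <stdint.h>
#include <linux/if_macsec.h>

/* Hypothetical userspace record for an XPN secure association key; sizing
 * the salt from MACSEC_SALT_LEN avoids hard-coding the 12-byte length.
 */
struct xpn_sak_cfg {
	uint8_t key_id[MACSEC_KEYID_LEN];	/* 16 bytes, existing uAPI */
	uint8_t salt[MACSEC_SALT_LEN];		/* 12 bytes, newly exposed */
};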
Reviewed-by: Raed Salem Reviewed-by: Sabrina Dubroca Signed-off-by: Emeel Hakim Link: https://lore.kernel.org/r/20220818153229.4721-1-ehakim@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/macsec.h | 1 - include/uapi/linux/if_macsec.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/macsec.h b/include/net/macsec.h index d6fa6b97f6ef..73780aa73644 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -14,7 +14,6 @@ #define MACSEC_DEFAULT_PN_LEN 4 #define MACSEC_XPN_PN_LEN 8 -#define MACSEC_SALT_LEN 12 #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ typedef u64 __bitwise sci_t; diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h index 3af2aa069a36..d5b6d1f37353 100644 --- a/include/uapi/linux/if_macsec.h +++ b/include/uapi/linux/if_macsec.h @@ -22,6 +22,8 @@ #define MACSEC_KEYID_LEN 16 +#define MACSEC_SALT_LEN 12 + /* cipher IDs as per IEEE802.1AE-2018 (Table 14-1) */ #define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL #define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL -- cgit v1.2.3 From 43cc0ec38131c10557c771760ffdfdb74a2da155 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 11 Jul 2022 12:21:10 +0200 Subject: media: v4l2-ctrls: add change flag for when dimensions change Add a new V4L2_EVENT_CTRL_CH_DIMENSIONS change flag that is issued when the dimensions of an array change as a result of a __v4l2_ctrl_modify_dimensions() call. This will inform userspace that there are new dimensions. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/vidioc-dqevent.rst | 5 +++++ Documentation/userspace-api/media/videodev2.h.rst.exceptions | 1 + drivers/media/v4l2-core/v4l2-ctrls-api.c | 3 ++- include/uapi/linux/videodev2.h | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/vidioc-dqevent.rst b/Documentation/userspace-api/media/v4l/vidioc-dqevent.rst index 6eb40073c906..8db103760930 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-dqevent.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-dqevent.rst @@ -332,6 +332,11 @@ call. - 0x0004 - This control event was triggered because the minimum, maximum, step or the default value of the control changed. + * - ``V4L2_EVENT_CTRL_CH_DIMENSIONS`` + - 0x0008 + - This control event was triggered because the dimensions of the + control changed. Note that the number of dimensions remains the + same. .. 
tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.5cm}| diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index 64b2c0b1f666..2a589d34b80e 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -514,6 +514,7 @@ replace define V4L2_EVENT_PRIVATE_START event-type replace define V4L2_EVENT_CTRL_CH_VALUE ctrl-changes-flags replace define V4L2_EVENT_CTRL_CH_FLAGS ctrl-changes-flags replace define V4L2_EVENT_CTRL_CH_RANGE ctrl-changes-flags +replace define V4L2_EVENT_CTRL_CH_DIMENSIONS ctrl-changes-flags replace define V4L2_EVENT_SRC_CH_RESOLUTION src-changes-flags diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c index 878da8592106..67fbdccda2d8 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-api.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c @@ -1020,7 +1020,8 @@ int __v4l2_ctrl_modify_dimensions(struct v4l2_ctrl *ctrl, for (i = 0; i < elems; i++) ctrl->type_ops->init(ctrl, i, ctrl->p_cur); cur_to_new(ctrl); - send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_VALUE); + send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_VALUE | + V4L2_EVENT_CTRL_CH_DIMENSIONS); return 0; } EXPORT_SYMBOL(__v4l2_ctrl_modify_dimensions); diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 01e630f2ec78..c415ce5b6829 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -2435,6 +2435,7 @@ struct v4l2_event_vsync { #define V4L2_EVENT_CTRL_CH_VALUE (1 << 0) #define V4L2_EVENT_CTRL_CH_FLAGS (1 << 1) #define V4L2_EVENT_CTRL_CH_RANGE (1 << 2) +#define V4L2_EVENT_CTRL_CH_DIMENSIONS (1 << 3) struct v4l2_event_ctrl { __u32 changes; -- cgit v1.2.3 From 1202cdd665315c525b5237e96e0bedc76d7e754f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 17 Aug 2022 17:43:21 -0700 Subject: Remove DECnet support from kernel DECnet is an obsolete network protocol that receives more attention from kernel janitors than users. It belongs in a computer protocol history museum, not in the Linux kernel. It has been "Orphaned" in the kernel since 2010. The iproute2 support for DECnet was dropped in the 5.0 release. The documentation link on Sourceforge says it is abandoned there as well. Leave the UAPI alone to keep userspace programs compiling. This means that there is still an empty neighbour table for AF_DECNET. The table of /proc/sys/net entries was updated to match current directories and reformatted to be alphabetical. Signed-off-by: Stephen Hemminger Acked-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- Documentation/admin-guide/kernel-parameters.txt | 4 - Documentation/admin-guide/sysctl/net.rst | 15 +- Documentation/networking/decnet.rst | 243 -- Documentation/networking/index.rst | 1 - Documentation/userspace-api/ioctl/ioctl-number.rst | 1 - MAINTAINERS | 7 - arch/mips/configs/decstation_64_defconfig | 2 - arch/mips/configs/decstation_defconfig | 2 - arch/mips/configs/decstation_r4k_defconfig | 2 - arch/mips/configs/gpr_defconfig | 2 - arch/mips/configs/mtx1_defconfig | 2 - arch/mips/configs/rm200_defconfig | 2 - arch/powerpc/configs/ppc6xx_defconfig | 2 - include/linux/netdevice.h | 4 - include/linux/netfilter.h | 5 - include/linux/netfilter_defs.h | 8 - include/net/dn.h | 231 -- include/net/dn_dev.h | 200 -- include/net/dn_fib.h | 169 -- include/net/dn_neigh.h | 32 - include/net/dn_nsp.h | 201 -- include/net/dn_route.h | 118 - include/net/netns/netfilter.h | 3 - include/uapi/linux/dn.h | 149 -- include/uapi/linux/netfilter_decnet.h | 72 - include/uapi/linux/netlink.h | 2 +- net/Kconfig | 2 - net/Makefile | 1 - net/core/dev.c | 4 +- net/core/neighbour.c | 3 - net/decnet/Kconfig | 43 - net/decnet/Makefile | 10 - net/decnet/README | 8 - net/decnet/af_decnet.c | 2404 -------------------- net/decnet/dn_dev.c | 1433 ------------ net/decnet/dn_fib.c | 798 ------- net/decnet/dn_neigh.c | 607 ----- net/decnet/dn_nsp_in.c | 907 -------- net/decnet/dn_nsp_out.c | 696 ------ net/decnet/dn_route.c | 1922 ---------------- net/decnet/dn_rules.c | 253 -- net/decnet/dn_table.c | 929 -------- net/decnet/dn_timer.c | 104 - net/decnet/netfilter/Kconfig | 17 - net/decnet/netfilter/Makefile | 6 - net/decnet/netfilter/dn_rtmsg.c | 158 -- net/decnet/sysctl_net_decnet.c | 362 --- net/netfilter/core.c | 10 - net/netfilter/nfnetlink_hook.c | 7 - 49 files changed, 10 insertions(+), 12153 deletions(-) delete mode 100644 Documentation/networking/decnet.rst delete mode 100644 include/net/dn.h delete mode 100644 include/net/dn_dev.h delete mode 100644 include/net/dn_fib.h delete mode 100644 include/net/dn_neigh.h delete mode 100644 include/net/dn_nsp.h delete mode 100644 include/net/dn_route.h delete mode 100644 include/uapi/linux/dn.h delete mode 100644 include/uapi/linux/netfilter_decnet.h delete mode 100644 net/decnet/Kconfig delete mode 100644 net/decnet/Makefile delete mode 100644 net/decnet/README delete mode 100644 net/decnet/af_decnet.c delete mode 100644 net/decnet/dn_dev.c delete mode 100644 net/decnet/dn_fib.c delete mode 100644 net/decnet/dn_neigh.c delete mode 100644 net/decnet/dn_nsp_in.c delete mode 100644 net/decnet/dn_nsp_out.c delete mode 100644 net/decnet/dn_route.c delete mode 100644 net/decnet/dn_rules.c delete mode 100644 net/decnet/dn_table.c delete mode 100644 net/decnet/dn_timer.c delete mode 100644 net/decnet/netfilter/Kconfig delete mode 100644 net/decnet/netfilter/Makefile delete mode 100644 net/decnet/netfilter/dn_rtmsg.c delete mode 100644 net/decnet/sysctl_net_decnet.c (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d7f30902fda0..adfda56b2691 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -966,10 +966,6 @@ debugpat [X86] Enable PAT debugging - decnet.addr= [HW,NET] - Format: [,] - See also Documentation/networking/decnet.rst. - default_hugepagesz= [HW] The size of the default HugeTLB page. 
This is the size represented by the legacy /proc/ hugepages diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 805f2281e000..82879a9d5683 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -34,13 +34,14 @@ Table : Subdirectories in /proc/sys/net ========= =================== = ========== ================== Directory Content Directory Content ========= =================== = ========== ================== - core General parameter appletalk Appletalk protocol - unix Unix domain sockets netrom NET/ROM - 802 E802 protocol ax25 AX25 - ethernet Ethernet protocol rose X.25 PLP layer - ipv4 IP version 4 x25 X.25 protocol - bridge Bridging decnet DEC net - ipv6 IP version 6 tipc TIPC + 802 E802 protocol mptcp Multipath TCP + appletalk Appletalk protocol netfilter Network Filter + ax25 AX25 netrom NET/ROM + bridge Bridging rose X.25 PLP layer + core General parameter tipc TIPC + ethernet Ethernet protocol unix Unix domain sockets + ipv4 IP version 4 x25 X.25 protocol + ipv6 IP version 6 ========= =================== = ========== ================== 1. /proc/sys/net/core - Network core options diff --git a/Documentation/networking/decnet.rst b/Documentation/networking/decnet.rst deleted file mode 100644 index b8bc11ff8370..000000000000 --- a/Documentation/networking/decnet.rst +++ /dev/null @@ -1,243 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================================= -Linux DECnet Networking Layer Information -========================================= - -1. Other documentation.... -========================== - - - Project Home Pages - - http://www.chygwyn.com/ - Kernel info - - http://linux-decnet.sourceforge.net/ - Userland tools - - http://www.sourceforge.net/projects/linux-decnet/ - Status page - -2. Configuring the kernel -========================= - -Be sure to turn on the following options: - - - CONFIG_DECNET (obviously) - - CONFIG_PROC_FS (to see what's going on) - - CONFIG_SYSCTL (for easy configuration) - -if you want to try out router support (not properly debugged yet) -you'll need the following options as well... - - - CONFIG_DECNET_ROUTER (to be able to add/delete routes) - - CONFIG_NETFILTER (will be required for the DECnet routing daemon) - -Don't turn on SIOCGIFCONF support for DECnet unless you are really sure -that you need it, in general you won't and it can cause ifconfig to -malfunction. - -Run time configuration has changed slightly from the 2.4 system. If you -want to configure an endnode, then the simplified procedure is as follows: - - - Set the MAC address on your ethernet card before starting _any_ other - network protocols. - -As soon as your network card is brought into the UP state, DECnet should -start working. If you need something more complicated or are unsure how -to set the MAC address, see the next section. Also all configurations which -worked with 2.4 will work under 2.5 with no change. - -3. Command line options -======================= - -You can set a DECnet address on the kernel command line for compatibility -with the 2.4 configuration procedure, but in general it's not needed any more. -If you do st a DECnet address on the command line, it has only one purpose -which is that its added to the addresses on the loopback device. - -With 2.4 kernels, DECnet would only recognise addresses as local if they -were added to the loopback device. In 2.5, any local interface address -can be used to loop back to the local machine. 
Of course this does not -prevent you adding further addresses to the loopback device if you -want to. - -N.B. Since the address list of an interface determines the addresses for -which "hello" messages are sent, if you don't set an address on the loopback -interface then you won't see any entries in /proc/net/neigh for the local -host until such time as you start a connection. This doesn't affect the -operation of the local communications in any other way though. - -The kernel command line takes options looking like the following:: - - decnet.addr=1,2 - -the two numbers are the node address 1,2 = 1.2 For 2.2.xx kernels -and early 2.3.xx kernels, you must use a comma when specifying the -DECnet address like this. For more recent 2.3.xx kernels, you may -use almost any character except space, although a `.` would be the most -obvious choice :-) - -There used to be a third number specifying the node type. This option -has gone away in favour of a per interface node type. This is now set -using /proc/sys/net/decnet/conf//forwarding. This file can be -set with a single digit, 0=EndNode, 1=L1 Router and 2=L2 Router. - -There are also equivalent options for modules. The node address can -also be set through the /proc/sys/net/decnet/ files, as can other system -parameters. - -Currently the only supported devices are ethernet and ip_gre. The -ethernet address of your ethernet card has to be set according to the DECnet -address of the node in order for it to be autoconfigured (and then appear in -/proc/net/decnet_dev). There is a utility available at the above -FTP sites called dn2ethaddr which can compute the correct ethernet -address to use. The address can be set by ifconfig either before or -at the time the device is brought up. If you are using RedHat you can -add the line:: - - MACADDR=AA:00:04:00:03:04 - -or something similar, to /etc/sysconfig/network-scripts/ifcfg-eth0 or -wherever your network card's configuration lives. Setting the MAC address -of your ethernet card to an address starting with "hi-ord" will cause a -DECnet address which matches to be added to the interface (which you can -verify with iproute2). - -The default device for routing can be set through the /proc filesystem -by setting /proc/sys/net/decnet/default_device to the -device you want DECnet to route packets out of when no specific route -is available. Usually this will be eth0, for example:: - - echo -n "eth0" >/proc/sys/net/decnet/default_device - -If you don't set the default device, then it will default to the first -ethernet card which has been autoconfigured as described above. You can -confirm that by looking in the default_device file of course. - -There is a list of what the other files under /proc/sys/net/decnet/ do -on the kernel patch web site (shown above). - -4. Run time kernel configuration -================================ - - -This is either done through the sysctl/proc interface (see the kernel web -pages for details on what the various options do) or through the iproute2 -package in the same way as IPv4/6 configuration is performed. - -Documentation for iproute2 is included with the package, although there is -as yet no specific section on DECnet, most of the features apply to both -IP and DECnet, albeit with DECnet addresses instead of IP addresses and -a reduced functionality. - -If you want to configure a DECnet router you'll need the iproute2 package -since its the _only_ way to add and delete routes currently. 
Eventually -there will be a routing daemon to send and receive routing messages for -each interface and update the kernel routing tables accordingly. The -routing daemon will use netfilter to listen to routing packets, and -rtnetlink to update the kernels routing tables. - -The DECnet raw socket layer has been removed since it was there purely -for use by the routing daemon which will now use netfilter (a much cleaner -and more generic solution) instead. - -5. How can I tell if its working? -================================= - -Here is a quick guide of what to look for in order to know if your DECnet -kernel subsystem is working. - - - Is the node address set (see /proc/sys/net/decnet/node_address) - - Is the node of the correct type - (see /proc/sys/net/decnet/conf//forwarding) - - Is the Ethernet MAC address of each Ethernet card set to match - the DECnet address. If in doubt use the dn2ethaddr utility available - at the ftp archive. - - If the previous two steps are satisfied, and the Ethernet card is up, - you should find that it is listed in /proc/net/decnet_dev and also - that it appears as a directory in /proc/sys/net/decnet/conf/. The - loopback device (lo) should also appear and is required to communicate - within a node. - - If you have any DECnet routers on your network, they should appear - in /proc/net/decnet_neigh, otherwise this file will only contain the - entry for the node itself (if it doesn't check to see if lo is up). - - If you want to send to any node which is not listed in the - /proc/net/decnet_neigh file, you'll need to set the default device - to point to an Ethernet card with connection to a router. This is - again done with the /proc/sys/net/decnet/default_device file. - - Try starting a simple server and client, like the dnping/dnmirror - over the loopback interface. With luck they should communicate. - For this step and those after, you'll need the DECnet library - which can be obtained from the above ftp sites as well as the - actual utilities themselves. - - If this seems to work, then try talking to a node on your local - network, and see if you can obtain the same results. - - At this point you are on your own... :-) - -6. How to send a bug report -=========================== - -If you've found a bug and want to report it, then there are several things -you can do to help me work out exactly what it is that is wrong. Useful -information (_most_ of which _is_ _essential_) includes: - - - What kernel version are you running ? - - What version of the patch are you running ? - - How far though the above set of tests can you get ? - - What is in the /proc/decnet* files and /proc/sys/net/decnet/* files ? - - Which services are you running ? - - Which client caused the problem ? - - How much data was being transferred ? - - Was the network congested ? - - How can the problem be reproduced ? - - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of - tcpdump don't understand how to dump DECnet properly, so including - the hex listing of the packet contents is _essential_, usually the -x flag. - You may also need to increase the length grabbed with the -s flag. The - -e flag also provides very useful information (ethernet MAC addresses)) - -7. MAC FAQ -========== - -A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet -interact and how to get the best performance from your hardware. 
- -Ethernet cards are designed to normally only pass received network frames -to a host computer when they are addressed to it, or to the broadcast address. - -Linux has an interface which allows the setting of extra addresses for -an ethernet card to listen to. If the ethernet card supports it, the -filtering operation will be done in hardware, if not the extra unwanted packets -received will be discarded by the host computer. In the latter case, -significant processor time and bus bandwidth can be used up on a busy -network (see the NAPI documentation for a longer explanation of these -effects). - -DECnet makes use of this interface to allow running DECnet on an ethernet -card which has already been configured using TCP/IP (presumably using the -built in MAC address of the card, as usual) and/or to allow multiple DECnet -addresses on each physical interface. If you do this, be aware that if your -ethernet card doesn't support perfect hashing in its MAC address filter -then your computer will be doing more work than required. Some cards -will simply set themselves into promiscuous mode in order to receive -packets from the DECnet specified addresses. So if you have one of these -cards its better to set the MAC address of the card as described above -to gain the best efficiency. Better still is to use a card which supports -NAPI as well. - - -8. Mailing list -=============== - -If you are keen to get involved in development, or want to ask questions -about configuration, or even just report bugs, then there is a mailing -list that you can join, details are at: - -http://sourceforge.net/mail/?group_id=4993 - -9. Legal Info -============= - -The Linux DECnet project team have placed their code under the GPL. The -software is provided "as is" and without warranty express or implied. -DECnet is a trademark of Compaq. This software is not a product of -Compaq. We acknowledge the help of people at Compaq in providing extra -documentation above and beyond what was previously publicly available. - -Steve Whitehouse - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 03b215bddde8..bacadd09e570 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -47,7 +47,6 @@ Contents: cdc_mbim dccp dctcp - decnet dns_resolver driver eql diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 3b985b19f39d..5f81e2a24a5c 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -308,7 +308,6 @@ Code Seq# Include File Comments 0x89 00-06 arch/x86/include/asm/sockios.h 0x89 0B-DF linux/sockios.h 0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range -0x89 E0-EF linux/dn.h PROTOPRIVATE range 0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range 0x8B all linux/wireless.h 0x8C 00-3F WiNRADiO driver diff --git a/MAINTAINERS b/MAINTAINERS index f512b430c7cb..61a34f63633c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5719,13 +5719,6 @@ F: include/linux/tfrc.h F: include/uapi/linux/dccp.h F: net/dccp/ -DECnet NETWORK LAYER -L: linux-decnet-user@lists.sourceforge.net -S: Orphan -W: http://linux-decnet.sourceforge.net -F: Documentation/networking/decnet.rst -F: net/decnet/ - DECSTATION PLATFORM SUPPORT M: "Maciej W. 
Rozycki" L: linux-mips@vger.kernel.org diff --git a/arch/mips/configs/decstation_64_defconfig b/arch/mips/configs/decstation_64_defconfig index 0021427a1bbe..4044f2829759 100644 --- a/arch/mips/configs/decstation_64_defconfig +++ b/arch/mips/configs/decstation_64_defconfig @@ -53,8 +53,6 @@ CONFIG_IPV6_SUBTREES=y CONFIG_NETWORK_SECMARK=y CONFIG_IP_SCTP=m CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m -CONFIG_DECNET_ROUTER=y # CONFIG_WIRELESS is not set # CONFIG_UEVENT_HELPER is not set # CONFIG_FW_LOADER is not set diff --git a/arch/mips/configs/decstation_defconfig b/arch/mips/configs/decstation_defconfig index 7a97a0818ce4..157fc57520a7 100644 --- a/arch/mips/configs/decstation_defconfig +++ b/arch/mips/configs/decstation_defconfig @@ -49,8 +49,6 @@ CONFIG_IPV6_SUBTREES=y CONFIG_NETWORK_SECMARK=y CONFIG_IP_SCTP=m CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m -CONFIG_DECNET_ROUTER=y # CONFIG_WIRELESS is not set # CONFIG_UEVENT_HELPER is not set # CONFIG_FW_LOADER is not set diff --git a/arch/mips/configs/decstation_r4k_defconfig b/arch/mips/configs/decstation_r4k_defconfig index a0643363526d..f73c26ebfc83 100644 --- a/arch/mips/configs/decstation_r4k_defconfig +++ b/arch/mips/configs/decstation_r4k_defconfig @@ -48,8 +48,6 @@ CONFIG_IPV6_SUBTREES=y CONFIG_NETWORK_SECMARK=y CONFIG_IP_SCTP=m CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m -CONFIG_DECNET_ROUTER=y # CONFIG_WIRELESS is not set # CONFIG_UEVENT_HELPER is not set # CONFIG_FW_LOADER is not set diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index d82f4ebf687f..ce8a444957c1 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -69,7 +69,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_DECNET_NF_GRABULATOR=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -99,7 +98,6 @@ CONFIG_ATM_MPOA=m CONFIG_ATM_BR2684=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m CONFIG_LLC2=m CONFIG_ATALK=m CONFIG_DEV_APPLETALK=m diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 4194e79b435c..1339c062a042 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -116,7 +116,6 @@ CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m -CONFIG_DECNET_NF_GRABULATOR=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -146,7 +145,6 @@ CONFIG_ATM_MPOA=m CONFIG_ATM_BR2684=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m CONFIG_LLC2=m CONFIG_ATALK=m CONFIG_DEV_APPLETALK=m diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig index 7d6f235e8ccb..04c681bd716e 100644 --- a/arch/mips/configs/rm200_defconfig +++ b/arch/mips/configs/rm200_defconfig @@ -116,7 +116,6 @@ CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m -CONFIG_DECNET_NF_GRABULATOR=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -137,7 +136,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE=m -CONFIG_DECNET=m CONFIG_NET_SCHED=y CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 91967824272e..a24f484bfbd2 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -243,8 +243,6 @@ CONFIG_ATM_LANE=m CONFIG_ATM_BR2684=m CONFIG_BRIDGE=m 
CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m -CONFIG_DECNET_ROUTER=y CONFIG_ATALK=m CONFIG_DEV_APPLETALK=m CONFIG_IPDDP=m diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..64e8662632f8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1837,7 +1837,6 @@ enum netdev_ml_priv_type { * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data - * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering @@ -2133,9 +2132,6 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct dn_dev __rcu *dn_ptr; -#endif #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c2c6f332fb90..d8817d381c14 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,11 +243,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); - break; -#endif default: WARN_ON_ONCE(1); break; diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h index 8dddfb151f00..a5f7bef1b3a4 100644 --- a/include/linux/netfilter_defs.h +++ b/include/linux/netfilter_defs.h @@ -7,14 +7,6 @@ /* in/out/forward only */ #define NF_ARP_NUMHOOKS 3 -/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */ -#define NF_DN_NUMHOOKS 7 - -#if IS_ENABLED(CONFIG_DECNET) -/* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ -#define NF_MAX_HOOKS NF_DN_NUMHOOKS -#else #define NF_MAX_HOOKS NF_INET_NUMHOOKS -#endif #endif diff --git a/include/net/dn.h b/include/net/dn.h deleted file mode 100644 index ba9655b0098a..000000000000 --- a/include/net/dn.h +++ /dev/null @@ -1,231 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_H -#define _NET_DN_H - -#include -#include -#include -#include -#include - -struct dn_scp /* Session Control Port */ -{ - unsigned char state; -#define DN_O 1 /* Open */ -#define DN_CR 2 /* Connect Receive */ -#define DN_DR 3 /* Disconnect Reject */ -#define DN_DRC 4 /* Discon. Rej. 
Complete*/ -#define DN_CC 5 /* Connect Confirm */ -#define DN_CI 6 /* Connect Initiate */ -#define DN_NR 7 /* No resources */ -#define DN_NC 8 /* No communication */ -#define DN_CD 9 /* Connect Delivery */ -#define DN_RJ 10 /* Rejected */ -#define DN_RUN 11 /* Running */ -#define DN_DI 12 /* Disconnect Initiate */ -#define DN_DIC 13 /* Disconnect Complete */ -#define DN_DN 14 /* Disconnect Notificat */ -#define DN_CL 15 /* Closed */ -#define DN_CN 16 /* Closed Notification */ - - __le16 addrloc; - __le16 addrrem; - __u16 numdat; - __u16 numoth; - __u16 numoth_rcv; - __u16 numdat_rcv; - __u16 ackxmt_dat; - __u16 ackxmt_oth; - __u16 ackrcv_dat; - __u16 ackrcv_oth; - __u8 flowrem_sw; - __u8 flowloc_sw; -#define DN_SEND 2 -#define DN_DONTSEND 1 -#define DN_NOCHANGE 0 - __u16 flowrem_dat; - __u16 flowrem_oth; - __u16 flowloc_dat; - __u16 flowloc_oth; - __u8 services_rem; - __u8 services_loc; - __u8 info_rem; - __u8 info_loc; - - __u16 segsize_rem; - __u16 segsize_loc; - - __u8 nonagle; - __u8 multi_ireq; - __u8 accept_mode; - unsigned long seg_total; /* Running total of current segment */ - - struct optdata_dn conndata_in; - struct optdata_dn conndata_out; - struct optdata_dn discdata_in; - struct optdata_dn discdata_out; - struct accessdata_dn accessdata; - - struct sockaddr_dn addr; /* Local address */ - struct sockaddr_dn peer; /* Remote address */ - - /* - * In this case the RTT estimation is not specified in the - * docs, nor is any back off algorithm. Here we follow well - * known tcp algorithms with a few small variations. - * - * snd_window: Max number of packets we send before we wait for - * an ack to come back. This will become part of a - * more complicated scheme when we support flow - * control. - * - * nsp_srtt: Round-Trip-Time (x8) in jiffies. This is a rolling - * average. - * nsp_rttvar: Round-Trip-Time-Varience (x4) in jiffies. This is the - * varience of the smoothed average (but calculated in - * a simpler way than for normal statistical varience - * calculations). - * - * nsp_rxtshift: Backoff counter. Value is zero normally, each time - * a packet is lost is increases by one until an ack - * is received. Its used to index an array of backoff - * multipliers. 
- */ -#define NSP_MIN_WINDOW 1 -#define NSP_MAX_WINDOW (0x07fe) - unsigned long max_window; - unsigned long snd_window; -#define NSP_INITIAL_SRTT (HZ) - unsigned long nsp_srtt; -#define NSP_INITIAL_RTTVAR (HZ*3) - unsigned long nsp_rttvar; -#define NSP_MAXRXTSHIFT 12 - unsigned long nsp_rxtshift; - - /* - * Output queues, one for data, one for otherdata/linkservice - */ - struct sk_buff_head data_xmit_queue; - struct sk_buff_head other_xmit_queue; - - /* - * Input queue for other data - */ - struct sk_buff_head other_receive_queue; - int other_report; - - /* - * Stuff to do with the slow timer - */ - unsigned long stamp; /* time of last transmit */ - unsigned long persist; - int (*persist_fxn)(struct sock *sk); - unsigned long keepalive; - void (*keepalive_fxn)(struct sock *sk); - -}; - -static inline struct dn_scp *DN_SK(struct sock *sk) -{ - return (struct dn_scp *)(sk + 1); -} - -/* - * src,dst : Source and Destination DECnet addresses - * hops : Number of hops through the network - * dst_port, src_port : NSP port numbers - * services, info : Useful data extracted from conninit messages - * rt_flags : Routing flags byte - * nsp_flags : NSP layer flags byte - * segsize : Size of segment - * segnum : Number, for data, otherdata and linkservice - * xmit_count : Number of times we've transmitted this skb - * stamp : Time stamp of most recent transmission, used in RTT calculations - * iif: Input interface number - * - * As a general policy, this structure keeps all addresses in network - * byte order, and all else in host byte order. Thus dst, src, dst_port - * and src_port are in network order. All else is in host order. - * - */ -#define DN_SKB_CB(skb) ((struct dn_skb_cb *)(skb)->cb) -struct dn_skb_cb { - __le16 dst; - __le16 src; - __u16 hops; - __le16 dst_port; - __le16 src_port; - __u8 services; - __u8 info; - __u8 rt_flags; - __u8 nsp_flags; - __u16 segsize; - __u16 segnum; - __u16 xmit_count; - unsigned long stamp; - int iif; -}; - -static inline __le16 dn_eth2dn(const unsigned char *ethaddr) -{ - return get_unaligned((__le16 *)(ethaddr + 4)); -} - -static inline __le16 dn_saddr2dn(struct sockaddr_dn *saddr) -{ - return *(__le16 *)saddr->sdn_nodeaddr; -} - -static inline void dn_dn2eth(unsigned char *ethaddr, __le16 addr) -{ - __u16 a = le16_to_cpu(addr); - ethaddr[0] = 0xAA; - ethaddr[1] = 0x00; - ethaddr[2] = 0x04; - ethaddr[3] = 0x00; - ethaddr[4] = (__u8)(a & 0xff); - ethaddr[5] = (__u8)(a >> 8); -} - -static inline void dn_sk_ports_copy(struct flowidn *fld, struct dn_scp *scp) -{ - fld->fld_sport = scp->addrloc; - fld->fld_dport = scp->addrrem; -} - -unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu); -void dn_register_sysctl(void); -void dn_unregister_sysctl(void); - -#define DN_MENUVER_ACC 0x01 -#define DN_MENUVER_USR 0x02 -#define DN_MENUVER_PRX 0x04 -#define DN_MENUVER_UIC 0x08 - -struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr); -struct sock *dn_find_by_skb(struct sk_buff *skb); -#define DN_ASCBUF_LEN 9 -char *dn_addr2asc(__u16, char *); -int dn_destroy_timer(struct sock *sk); - -int dn_sockaddr2username(struct sockaddr_dn *addr, unsigned char *buf, - unsigned char type); -int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *addr, - unsigned char *type); - -void dn_start_slow_timer(struct sock *sk); -void dn_stop_slow_timer(struct sock *sk); - -extern __le16 decnet_address; -extern int decnet_debug_level; -extern int decnet_time_wait; -extern int decnet_dn_count; -extern int decnet_di_count; -extern int decnet_dr_count; -extern 
int decnet_no_fc_max_cwnd; - -extern long sysctl_decnet_mem[3]; -extern int sysctl_decnet_wmem[3]; -extern int sysctl_decnet_rmem[3]; - -#endif /* _NET_DN_H */ diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h deleted file mode 100644 index bec303ea8367..000000000000 --- a/include/net/dn_dev.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_DEV_H -#define _NET_DN_DEV_H - -#include - -struct dn_dev; - -struct dn_ifaddr { - struct dn_ifaddr __rcu *ifa_next; - struct dn_dev *ifa_dev; - __le16 ifa_local; - __le16 ifa_address; - __u32 ifa_flags; - __u8 ifa_scope; - char ifa_label[IFNAMSIZ]; - struct rcu_head rcu; -}; - -#define DN_DEV_S_RU 0 /* Run - working normally */ -#define DN_DEV_S_CR 1 /* Circuit Rejected */ -#define DN_DEV_S_DS 2 /* Data Link Start */ -#define DN_DEV_S_RI 3 /* Routing Layer Initialize */ -#define DN_DEV_S_RV 4 /* Routing Layer Verify */ -#define DN_DEV_S_RC 5 /* Routing Layer Complete */ -#define DN_DEV_S_OF 6 /* Off */ -#define DN_DEV_S_HA 7 /* Halt */ - - -/* - * The dn_dev_parms structure contains the set of parameters - * for each device (hence inclusion in the dn_dev structure) - * and an array is used to store the default types of supported - * device (in dn_dev.c). - * - * The type field matches the ARPHRD_ constants and is used in - * searching the list for supported devices when new devices - * come up. - * - * The mode field is used to find out if a device is broadcast, - * multipoint, or pointopoint. Please note that DECnet thinks - * different ways about devices to the rest of the kernel - * so the normal IFF_xxx flags are invalid here. For devices - * which can be any combination of the previously mentioned - * attributes, you can set this on a per device basis by - * installing an up() routine. - * - * The device state field, defines the initial state in which the - * device will come up. In the dn_dev structure, it is the actual - * state. - * - * Things have changed here. I've killed timer1 since it's a user space - * issue for a user space routing deamon to sort out. The kernel does - * not need to be bothered with it. - * - * Timers: - * t2 - Rate limit timer, min time between routing and hello messages - * t3 - Hello timer, send hello messages when it expires - * - * Callbacks: - * up() - Called to initialize device, return value can veto use of - * device with DECnet. 
- * down() - Called to turn device off when it goes down - * timer3() - Called once for each ifaddr when timer 3 goes off - * - * sysctl - Hook for sysctl things - * - */ -struct dn_dev_parms { - int type; /* ARPHRD_xxx */ - int mode; /* Broadcast, Unicast, Mulitpoint */ -#define DN_DEV_BCAST 1 -#define DN_DEV_UCAST 2 -#define DN_DEV_MPOINT 4 - int state; /* Initial state */ - int forwarding; /* 0=EndNode, 1=L1Router, 2=L2Router */ - unsigned long t2; /* Default value of t2 */ - unsigned long t3; /* Default value of t3 */ - int priority; /* Priority to be a router */ - char *name; /* Name for sysctl */ - int (*up)(struct net_device *); - void (*down)(struct net_device *); - void (*timer3)(struct net_device *, struct dn_ifaddr *ifa); - void *sysctl; -}; - - -struct dn_dev { - struct dn_ifaddr __rcu *ifa_list; - struct net_device *dev; - struct dn_dev_parms parms; - char use_long; - struct timer_list timer; - unsigned long t3; - struct neigh_parms *neigh_parms; - __u8 addr[ETH_ALEN]; - struct neighbour *router; /* Default router on circuit */ - struct neighbour *peer; /* Peer on pointopoint links */ - unsigned long uptime; /* Time device went up in jiffies */ -}; - -struct dn_short_packet { - __u8 msgflg; - __le16 dstnode; - __le16 srcnode; - __u8 forward; -} __packed; - -struct dn_long_packet { - __u8 msgflg; - __u8 d_area; - __u8 d_subarea; - __u8 d_id[6]; - __u8 s_area; - __u8 s_subarea; - __u8 s_id[6]; - __u8 nl2; - __u8 visit_ct; - __u8 s_class; - __u8 pt; -} __packed; - -/*------------------------- DRP - Routing messages ---------------------*/ - -struct endnode_hello_message { - __u8 msgflg; - __u8 tiver[3]; - __u8 id[6]; - __u8 iinfo; - __le16 blksize; - __u8 area; - __u8 seed[8]; - __u8 neighbor[6]; - __le16 timer; - __u8 mpd; - __u8 datalen; - __u8 data[2]; -} __packed; - -struct rtnode_hello_message { - __u8 msgflg; - __u8 tiver[3]; - __u8 id[6]; - __u8 iinfo; - __le16 blksize; - __u8 priority; - __u8 area; - __le16 timer; - __u8 mpd; -} __packed; - - -void dn_dev_init(void); -void dn_dev_cleanup(void); - -int dn_dev_ioctl(unsigned int cmd, void __user *arg); - -void dn_dev_devices_off(void); -void dn_dev_devices_on(void); - -void dn_dev_init_pkt(struct sk_buff *skb); -void dn_dev_veri_pkt(struct sk_buff *skb); -void dn_dev_hello(struct sk_buff *skb); - -void dn_dev_up(struct net_device *); -void dn_dev_down(struct net_device *); - -int dn_dev_set_default(struct net_device *dev, int force); -struct net_device *dn_dev_get_default(void); -int dn_dev_bind_default(__le16 *addr); - -int register_dnaddr_notifier(struct notifier_block *nb); -int unregister_dnaddr_notifier(struct notifier_block *nb); - -static inline int dn_dev_islocal(struct net_device *dev, __le16 addr) -{ - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int res = 0; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) { - printk(KERN_DEBUG "dn_dev_islocal: Called for non DECnet device\n"); - goto out; - } - - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa != NULL; - ifa = rcu_dereference(ifa->ifa_next)) - if ((addr ^ ifa->ifa_local) == 0) { - res = 1; - break; - } -out: - rcu_read_unlock(); - return res; -} - -#endif /* _NET_DN_DEV_H */ diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h deleted file mode 100644 index 1929a3cd5ebe..000000000000 --- a/include/net/dn_fib.h +++ /dev/null @@ -1,169 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_FIB_H -#define _NET_DN_FIB_H - -#include -#include -#include -#include - -extern const struct nla_policy rtm_dn_policy[]; - 
-struct dn_fib_res { - struct fib_rule *r; - struct dn_fib_info *fi; - unsigned char prefixlen; - unsigned char nh_sel; - unsigned char type; - unsigned char scope; -}; - -struct dn_fib_nh { - struct net_device *nh_dev; - unsigned int nh_flags; - unsigned char nh_scope; - int nh_weight; - int nh_power; - int nh_oif; - __le16 nh_gw; -}; - -struct dn_fib_info { - struct dn_fib_info *fib_next; - struct dn_fib_info *fib_prev; - refcount_t fib_treeref; - refcount_t fib_clntref; - int fib_dead; - unsigned int fib_flags; - int fib_protocol; - __le16 fib_prefsrc; - __u32 fib_priority; - __u32 fib_metrics[RTAX_MAX]; - int fib_nhs; - int fib_power; - struct dn_fib_nh fib_nh[0]; -#define dn_fib_dev fib_nh[0].nh_dev -}; - - -#define DN_FIB_RES_RESET(res) ((res).nh_sel = 0) -#define DN_FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) - -#define DN_FIB_RES_PREFSRC(res) ((res).fi->fib_prefsrc ? : __dn_fib_res_prefsrc(&res)) -#define DN_FIB_RES_GW(res) (DN_FIB_RES_NH(res).nh_gw) -#define DN_FIB_RES_DEV(res) (DN_FIB_RES_NH(res).nh_dev) -#define DN_FIB_RES_OIF(res) (DN_FIB_RES_NH(res).nh_oif) - -typedef struct { - __le16 datum; -} dn_fib_key_t; - -typedef struct { - __le16 datum; -} dn_fib_hash_t; - -typedef struct { - __u16 datum; -} dn_fib_idx_t; - -struct dn_fib_node { - struct dn_fib_node *fn_next; - struct dn_fib_info *fn_info; -#define DN_FIB_INFO(f) ((f)->fn_info) - dn_fib_key_t fn_key; - u8 fn_type; - u8 fn_scope; - u8 fn_state; -}; - - -struct dn_fib_table { - struct hlist_node hlist; - u32 n; - - int (*insert)(struct dn_fib_table *t, struct rtmsg *r, - struct nlattr *attrs[], struct nlmsghdr *n, - struct netlink_skb_parms *req); - int (*delete)(struct dn_fib_table *t, struct rtmsg *r, - struct nlattr *attrs[], struct nlmsghdr *n, - struct netlink_skb_parms *req); - int (*lookup)(struct dn_fib_table *t, const struct flowidn *fld, - struct dn_fib_res *res); - int (*flush)(struct dn_fib_table *t); - int (*dump)(struct dn_fib_table *t, struct sk_buff *skb, struct netlink_callback *cb); - - unsigned char data[]; -}; - -#ifdef CONFIG_DECNET_ROUTER -/* - * dn_fib.c - */ -void dn_fib_init(void); -void dn_fib_cleanup(void); - -int dn_fib_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, - struct nlattr *attrs[], - const struct nlmsghdr *nlh, int *errp); -int dn_fib_semantic_match(int type, struct dn_fib_info *fi, - const struct flowidn *fld, struct dn_fib_res *res); -void dn_fib_release_info(struct dn_fib_info *fi); -void dn_fib_flush(void); -void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res); - -/* - * dn_tables.c - */ -struct dn_fib_table *dn_fib_get_table(u32 n, int creat); -struct dn_fib_table *dn_fib_empty_table(void); -void dn_fib_table_init(void); -void dn_fib_table_cleanup(void); - -/* - * dn_rules.c - */ -void dn_fib_rules_init(void); -void dn_fib_rules_cleanup(void); -unsigned int dnet_addr_type(__le16 addr); -int dn_fib_lookup(struct flowidn *fld, struct dn_fib_res *res); - -int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb); - -void dn_fib_free_info(struct dn_fib_info *fi); - -static inline void dn_fib_info_put(struct dn_fib_info *fi) -{ - if (refcount_dec_and_test(&fi->fib_clntref)) - dn_fib_free_info(fi); -} - -static inline void dn_fib_res_put(struct dn_fib_res *res) -{ - if (res->fi) - dn_fib_info_put(res->fi); - if (res->r) - fib_rule_put(res->r); -} - -#else /* Endnode */ - -#define dn_fib_init() do { } while(0) -#define dn_fib_cleanup() do { } while(0) - -#define 
dn_fib_lookup(fl, res) (-ESRCH) -#define dn_fib_info_put(fi) do { } while(0) -#define dn_fib_select_multipath(fl, res) do { } while(0) -#define dn_fib_rules_policy(saddr,res,flags) (0) -#define dn_fib_res_put(res) do { } while(0) - -#endif /* CONFIG_DECNET_ROUTER */ - -static inline __le16 dnet_make_mask(int n) -{ - if (n) - return cpu_to_le16(~((1 << (16 - n)) - 1)); - return cpu_to_le16(0); -} - -#endif /* _NET_DN_FIB_H */ diff --git a/include/net/dn_neigh.h b/include/net/dn_neigh.h deleted file mode 100644 index 1f7df98bfc33..000000000000 --- a/include/net/dn_neigh.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_NEIGH_H -#define _NET_DN_NEIGH_H - -#include - -/* - * The position of the first two fields of - * this structure are critical - SJW - */ -struct dn_neigh { - struct neighbour n; - __le16 addr; - unsigned long flags; -#define DN_NDFLAG_R1 0x0001 /* Router L1 */ -#define DN_NDFLAG_R2 0x0002 /* Router L2 */ -#define DN_NDFLAG_P3 0x0004 /* Phase III Node */ - unsigned long blksize; - __u8 priority; -}; - -void dn_neigh_init(void); -void dn_neigh_cleanup(void); -int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb); -int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb); -void dn_neigh_pointopoint_hello(struct sk_buff *skb); -int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n); -int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb); - -extern struct neigh_table dn_neigh_table; - -#endif /* _NET_DN_NEIGH_H */ diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h deleted file mode 100644 index a4a18fee0b7c..000000000000 --- a/include/net/dn_nsp.h +++ /dev/null @@ -1,201 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_DN_NSP_H -#define _NET_DN_NSP_H -/****************************************************************************** - (c) 1995-1998 E.M. 
Serrat emserrat@geocities.com - -*******************************************************************************/ -/* dn_nsp.c functions prototyping */ -#include -#include -#include - -struct sk_buff; -struct sk_buff_head; - -void dn_nsp_send_data_ack(struct sock *sk); -void dn_nsp_send_oth_ack(struct sock *sk); -void dn_send_conn_ack(struct sock *sk); -void dn_send_conn_conf(struct sock *sk, gfp_t gfp); -void dn_nsp_send_disc(struct sock *sk, unsigned char type, - unsigned short reason, gfp_t gfp); -void dn_nsp_return_disc(struct sk_buff *skb, unsigned char type, - unsigned short reason); -void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval); -void dn_nsp_send_conninit(struct sock *sk, unsigned char flags); - -void dn_nsp_output(struct sock *sk); -int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *q, unsigned short acknum); -void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, gfp_t gfp, - int oob); -unsigned long dn_nsp_persist(struct sock *sk); -int dn_nsp_xmit_timeout(struct sock *sk); - -int dn_nsp_rx(struct sk_buff *); -int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb); - -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri); -struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, - long timeo, int *err); - -#define NSP_REASON_OK 0 /* No error */ -#define NSP_REASON_NR 1 /* No resources */ -#define NSP_REASON_UN 2 /* Unrecognised node name */ -#define NSP_REASON_SD 3 /* Node shutting down */ -#define NSP_REASON_ID 4 /* Invalid destination end user */ -#define NSP_REASON_ER 5 /* End user lacks resources */ -#define NSP_REASON_OB 6 /* Object too busy */ -#define NSP_REASON_US 7 /* Unspecified error */ -#define NSP_REASON_TP 8 /* Third-Party abort */ -#define NSP_REASON_EA 9 /* End user has aborted the link */ -#define NSP_REASON_IF 10 /* Invalid node name format */ -#define NSP_REASON_LS 11 /* Local node shutdown */ -#define NSP_REASON_LL 32 /* Node lacks logical-link resources */ -#define NSP_REASON_LE 33 /* End user lacks logical-link resources */ -#define NSP_REASON_UR 34 /* Unacceptable RQSTRID or PASSWORD field */ -#define NSP_REASON_UA 36 /* Unacceptable ACCOUNT field */ -#define NSP_REASON_TM 38 /* End user timed out logical link */ -#define NSP_REASON_NU 39 /* Node unreachable */ -#define NSP_REASON_NL 41 /* No-link message */ -#define NSP_REASON_DC 42 /* Disconnect confirm */ -#define NSP_REASON_IO 43 /* Image data field overflow */ - -#define NSP_DISCINIT 0x38 -#define NSP_DISCCONF 0x48 - -/*------------------------- NSP - messages ------------------------------*/ -/* Data Messages */ -/*---------------*/ - -/* Data Messages (data segment/interrupt/link service) */ - -struct nsp_data_seg_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; -} __packed; - -struct nsp_data_opt_msg { - __le16 acknum; - __le16 segnum; - __le16 lsflgs; -} __packed; - -struct nsp_data_opt_msg1 { - __le16 acknum; - __le16 segnum; -} __packed; - - -/* Acknowledgment Message (data/other data) */ -struct nsp_data_ack_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; - __le16 acknum; -} __packed; - -/* Connect Acknowledgment Message */ -struct nsp_conn_ack_msg { - __u8 msgflg; - __le16 dstaddr; -} __packed; - - -/* Connect Initiate/Retransmit Initiate/Connect Confirm */ -struct nsp_conn_init_msg { - __u8 msgflg; -#define NSP_CI 0x18 /* Connect Initiate */ -#define NSP_RCI 0x68 /* Retrans. 
Conn Init */ - __le16 dstaddr; - __le16 srcaddr; - __u8 services; -#define NSP_FC_NONE 0x00 /* Flow Control None */ -#define NSP_FC_SRC 0x04 /* Seg Req. Count */ -#define NSP_FC_SCMC 0x08 /* Sess. Control Mess */ -#define NSP_FC_MASK 0x0c /* FC type mask */ - __u8 info; - __le16 segsize; -} __packed; - -/* Disconnect Initiate/Disconnect Confirm */ -struct nsp_disconn_init_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; - __le16 reason; -} __packed; - - - -struct srcobj_fmt { - __u8 format; - __u8 task; - __le16 grpcode; - __le16 usrcode; - __u8 dlen; -} __packed; - -/* - * A collection of functions for manipulating the sequence - * numbers used in NSP. Similar in operation to the functions - * of the same name in TCP. - */ -static __inline__ int dn_before(__u16 seq1, __u16 seq2) -{ - seq1 &= 0x0fff; - seq2 &= 0x0fff; - - return (int)((seq1 - seq2) & 0x0fff) > 2048; -} - - -static __inline__ int dn_after(__u16 seq1, __u16 seq2) -{ - seq1 &= 0x0fff; - seq2 &= 0x0fff; - - return (int)((seq2 - seq1) & 0x0fff) > 2048; -} - -static __inline__ int dn_equal(__u16 seq1, __u16 seq2) -{ - return ((seq1 ^ seq2) & 0x0fff) == 0; -} - -static __inline__ int dn_before_or_equal(__u16 seq1, __u16 seq2) -{ - return (dn_before(seq1, seq2) || dn_equal(seq1, seq2)); -} - -static __inline__ void seq_add(__u16 *seq, __u16 off) -{ - (*seq) += off; - (*seq) &= 0x0fff; -} - -static __inline__ int seq_next(__u16 seq1, __u16 seq2) -{ - return dn_equal(seq1 + 1, seq2); -} - -/* - * Can we delay the ack ? - */ -static __inline__ int sendack(__u16 seq) -{ - return (int)((seq & 0x1000) ? 0 : 1); -} - -/* - * Is socket congested ? - */ -static __inline__ int dn_congested(struct sock *sk) -{ - return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); -} - -#define DN_MAX_NSP_DATA_HEADER (11) - -#endif /* _NET_DN_NSP_H */ diff --git a/include/net/dn_route.h b/include/net/dn_route.h deleted file mode 100644 index 88c0300236cc..000000000000 --- a/include/net/dn_route.h +++ /dev/null @@ -1,118 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_DN_ROUTE_H -#define _NET_DN_ROUTE_H - -/****************************************************************************** - (c) 1995-1998 E.M. 
Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include -#include - -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri); -int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *, - struct sock *sk, int flags); -int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb); -void dn_rt_cache_flush(int delay); -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -/* Masks for flags field */ -#define DN_RT_F_PID 0x07 /* Mask for packet type */ -#define DN_RT_F_PF 0x80 /* Padding Follows */ -#define DN_RT_F_VER 0x40 /* Version =0 discard packet if ==1 */ -#define DN_RT_F_IE 0x20 /* Intra Ethernet, Reserved in short pkt */ -#define DN_RT_F_RTS 0x10 /* Packet is being returned to sender */ -#define DN_RT_F_RQR 0x08 /* Return packet to sender upon non-delivery */ - -/* Mask for types of routing packets */ -#define DN_RT_PKT_MSK 0x06 -/* Types of routing packets */ -#define DN_RT_PKT_SHORT 0x02 /* Short routing packet */ -#define DN_RT_PKT_LONG 0x06 /* Long routing packet */ - -/* Mask for control/routing selection */ -#define DN_RT_PKT_CNTL 0x01 /* Set to 1 if a control packet */ -/* Types of control packets */ -#define DN_RT_CNTL_MSK 0x0f /* Mask for control packets */ -#define DN_RT_PKT_INIT 0x01 /* Initialisation packet */ -#define DN_RT_PKT_VERI 0x03 /* Verification Message */ -#define DN_RT_PKT_HELO 0x05 /* Hello and Test Message */ -#define DN_RT_PKT_L1RT 0x07 /* Level 1 Routing Message */ -#define DN_RT_PKT_L2RT 0x09 /* Level 2 Routing Message */ -#define DN_RT_PKT_ERTH 0x0b /* Ethernet Router Hello */ -#define DN_RT_PKT_EEDH 0x0d /* Ethernet EndNode Hello */ - -/* Values for info field in hello message */ -#define DN_RT_INFO_TYPE 0x03 /* Type mask */ -#define DN_RT_INFO_L1RT 0x02 /* L1 Router */ -#define DN_RT_INFO_L2RT 0x01 /* L2 Router */ -#define DN_RT_INFO_ENDN 0x03 /* EndNode */ -#define DN_RT_INFO_VERI 0x04 /* Verification Reqd. */ -#define DN_RT_INFO_RJCT 0x08 /* Reject Flag, Reserved */ -#define DN_RT_INFO_VFLD 0x10 /* Verification Failed, Reserved */ -#define DN_RT_INFO_NOML 0x20 /* No Multicast traffic accepted */ -#define DN_RT_INFO_BLKR 0x40 /* Blocking Requested */ - -/* - * The fl structure is what we used to look up the route. - * The rt_saddr & rt_daddr entries are the same as key.saddr & key.daddr - * except for local input routes, where the rt_saddr = fl.fld_dst and - * rt_daddr = fl.fld_src to allow the route to be used for returning - * packets to the originating host. 
- */ -struct dn_route { - struct dst_entry dst; - struct dn_route __rcu *dn_next; - - struct neighbour *n; - - struct flowidn fld; - - __le16 rt_saddr; - __le16 rt_daddr; - __le16 rt_gateway; - __le16 rt_local_src; /* Source used for forwarding packets */ - __le16 rt_src_map; - __le16 rt_dst_map; - - unsigned int rt_flags; - unsigned int rt_type; -}; - -static inline bool dn_is_input_route(struct dn_route *rt) -{ - return rt->fld.flowidn_iif != 0; -} - -static inline bool dn_is_output_route(struct dn_route *rt) -{ - return rt->fld.flowidn_iif == 0; -} - -void dn_route_init(void); -void dn_route_cleanup(void); - -#include -#include - -static inline void dn_rt_send(struct sk_buff *skb) -{ - dev_queue_xmit(skb); -} - -static inline void dn_rt_finish_output(struct sk_buff *skb, char *dst, char *src) -{ - struct net_device *dev = skb->dev; - - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) - dst = NULL; - - if (dev_hard_header(skb, dev, ETH_P_DNA_RT, dst, src, skb->len) >= 0) - dn_rt_send(skb); - else - kfree_skb(skb); -} - -#endif /* _NET_DN_ROUTE_H */ diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index b593f95e9991..02bbdc577f8e 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -24,9 +24,6 @@ struct netns_nf { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; -#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) unsigned int defrag_ipv4_users; #endif diff --git a/include/uapi/linux/dn.h b/include/uapi/linux/dn.h deleted file mode 100644 index 36ca71bd8bbe..000000000000 --- a/include/uapi/linux/dn.h +++ /dev/null @@ -1,149 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_DN_H -#define _LINUX_DN_H - -#include -#include -#include - -/* - - DECnet Data Structures and Constants - -*/ - -/* - * DNPROTO_NSP can't be the same as SOL_SOCKET, - * so increment each by one (compared to ULTRIX) - */ -#define DNPROTO_NSP 2 /* NSP protocol number */ -#define DNPROTO_ROU 3 /* Routing protocol number */ -#define DNPROTO_NML 4 /* Net mgt protocol number */ -#define DNPROTO_EVL 5 /* Evl protocol number (usr) */ -#define DNPROTO_EVR 6 /* Evl protocol number (evl) */ -#define DNPROTO_NSPT 7 /* NSP trace protocol number */ - - -#define DN_ADDL 2 -#define DN_MAXADDL 2 /* ULTRIX headers have 20 here, but pathworks has 2 */ -#define DN_MAXOPTL 16 -#define DN_MAXOBJL 16 -#define DN_MAXACCL 40 -#define DN_MAXALIASL 128 -#define DN_MAXNODEL 256 -#define DNBUFSIZE 65023 - -/* - * SET/GET Socket options - must match the DSO_ numbers below - */ -#define SO_CONDATA 1 -#define SO_CONACCESS 2 -#define SO_PROXYUSR 3 -#define SO_LINKINFO 7 - -#define DSO_CONDATA 1 /* Set/Get connect data */ -#define DSO_DISDATA 10 /* Set/Get disconnect data */ -#define DSO_CONACCESS 2 /* Set/Get connect access data */ -#define DSO_ACCEPTMODE 4 /* Set/Get accept mode */ -#define DSO_CONACCEPT 5 /* Accept deferred connection */ -#define DSO_CONREJECT 6 /* Reject deferred connection */ -#define DSO_LINKINFO 7 /* Set/Get link information */ -#define DSO_STREAM 8 /* Set socket type to stream */ -#define DSO_SEQPACKET 9 /* Set socket type to sequenced packet */ -#define DSO_MAXWINDOW 11 /* Maximum window size allowed */ -#define DSO_NODELAY 12 /* Turn off nagle */ -#define DSO_CORK 13 /* Wait for more data! 
*/ -#define DSO_SERVICES 14 /* NSP Services field */ -#define DSO_INFO 15 /* NSP Info field */ -#define DSO_MAX 15 /* Maximum option number */ - - -/* LINK States */ -#define LL_INACTIVE 0 -#define LL_CONNECTING 1 -#define LL_RUNNING 2 -#define LL_DISCONNECTING 3 - -#define ACC_IMMED 0 -#define ACC_DEFER 1 - -#define SDF_WILD 1 /* Wild card object */ -#define SDF_PROXY 2 /* Addr eligible for proxy */ -#define SDF_UICPROXY 4 /* Use uic-based proxy */ - -/* Structures */ - - -struct dn_naddr { - __le16 a_len; - __u8 a_addr[DN_MAXADDL]; /* Two bytes little endian */ -}; - -struct sockaddr_dn { - __u16 sdn_family; - __u8 sdn_flags; - __u8 sdn_objnum; - __le16 sdn_objnamel; - __u8 sdn_objname[DN_MAXOBJL]; - struct dn_naddr sdn_add; -}; -#define sdn_nodeaddrl sdn_add.a_len /* Node address length */ -#define sdn_nodeaddr sdn_add.a_addr /* Node address */ - - - -/* - * DECnet set/get DSO_CONDATA, DSO_DISDATA (optional data) structure - */ -struct optdata_dn { - __le16 opt_status; /* Extended status return */ -#define opt_sts opt_status - __le16 opt_optl; /* Length of user data */ - __u8 opt_data[16]; /* User data */ -}; - -struct accessdata_dn { - __u8 acc_accl; - __u8 acc_acc[DN_MAXACCL]; - __u8 acc_passl; - __u8 acc_pass[DN_MAXACCL]; - __u8 acc_userl; - __u8 acc_user[DN_MAXACCL]; -}; - -/* - * DECnet logical link information structure - */ -struct linkinfo_dn { - __u16 idn_segsize; /* Segment size for link */ - __u8 idn_linkstate; /* Logical link state */ -}; - -/* - * Ethernet address format (for DECnet) - */ -union etheraddress { - __u8 dne_addr[ETH_ALEN]; /* Full ethernet address */ - struct { - __u8 dne_hiord[4]; /* DECnet HIORD prefix */ - __u8 dne_nodeaddr[2]; /* DECnet node address */ - } dne_remote; -}; - - -/* - * DECnet physical socket address format - */ -struct dn_addr { - __le16 dna_family; /* AF_DECnet */ - union etheraddress dna_netaddr; /* DECnet ethernet address */ -}; - -#define DECNET_IOCTL_BASE 0x89 /* PROTOPRIVATE range */ - -#define SIOCSNETADDR _IOW(DECNET_IOCTL_BASE, 0xe0, struct dn_naddr) -#define SIOCGNETADDR _IOR(DECNET_IOCTL_BASE, 0xe1, struct dn_naddr) -#define OSIOCSNETADDR _IOW(DECNET_IOCTL_BASE, 0xe0, int) -#define OSIOCGNETADDR _IOR(DECNET_IOCTL_BASE, 0xe1, int) - -#endif /* _LINUX_DN_H */ diff --git a/include/uapi/linux/netfilter_decnet.h b/include/uapi/linux/netfilter_decnet.h deleted file mode 100644 index 3c77f54560f2..000000000000 --- a/include/uapi/linux/netfilter_decnet.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_DECNET_NETFILTER_H -#define __LINUX_DECNET_NETFILTER_H - -/* DECnet-specific defines for netfilter. - * This file (C) Steve Whitehouse 1999 derived from the - * ipv4 netfilter header file which is - * (C)1998 Rusty Russell -- This code is GPL. - */ - -#include - -/* only for userspace compatibility */ -#ifndef __KERNEL__ - -#include /* for INT_MIN, INT_MAX */ - -/* kernel define is in netfilter_defs.h */ -#define NF_DN_NUMHOOKS 7 -#endif /* ! __KERNEL__ */ - -/* DECnet Hooks */ -/* After promisc drops, checksum checks. */ -#define NF_DN_PRE_ROUTING 0 -/* If the packet is destined for this box. */ -#define NF_DN_LOCAL_IN 1 -/* If the packet is destined for another interface. */ -#define NF_DN_FORWARD 2 -/* Packets coming from a local process. */ -#define NF_DN_LOCAL_OUT 3 -/* Packets about to hit the wire. 
*/ -#define NF_DN_POST_ROUTING 4 -/* Input Hello Packets */ -#define NF_DN_HELLO 5 -/* Input Routing Packets */ -#define NF_DN_ROUTE 6 - -enum nf_dn_hook_priorities { - NF_DN_PRI_FIRST = INT_MIN, - NF_DN_PRI_CONNTRACK = -200, - NF_DN_PRI_MANGLE = -150, - NF_DN_PRI_NAT_DST = -100, - NF_DN_PRI_FILTER = 0, - NF_DN_PRI_NAT_SRC = 100, - NF_DN_PRI_DNRTMSG = 200, - NF_DN_PRI_LAST = INT_MAX, -}; - -struct nf_dn_rtmsg { - int nfdn_ifindex; -}; - -#define NFDN_RTMSG(r) ((unsigned char *)(r) + NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg))) - -#ifndef __KERNEL__ -/* backwards compatibility for userspace */ -#define DNRMG_L1_GROUP 0x01 -#define DNRMG_L2_GROUP 0x02 -#endif - -enum { - DNRNG_NLGRP_NONE, -#define DNRNG_NLGRP_NONE DNRNG_NLGRP_NONE - DNRNG_NLGRP_L1, -#define DNRNG_NLGRP_L1 DNRNG_NLGRP_L1 - DNRNG_NLGRP_L2, -#define DNRNG_NLGRP_L2 DNRNG_NLGRP_L2 - __DNRNG_NLGRP_MAX -}; -#define DNRNG_NLGRP_MAX (__DNRNG_NLGRP_MAX - 1) - -#endif /*__LINUX_DECNET_NETFILTER_H*/ diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 855dffb4c1c3..1e543cf0568c 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -20,7 +20,7 @@ #define NETLINK_CONNECTOR 11 #define NETLINK_NETFILTER 12 /* netfilter subsystem */ #define NETLINK_IP6_FW 13 -#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_DNRTMSG 14 /* DECnet routing messages (obsolete) */ #define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ #define NETLINK_GENERIC 16 /* leave room for NETLINK_DM (DM Events) */ diff --git a/net/Kconfig b/net/Kconfig index 6b78f695caa6..48c33c222199 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -204,7 +204,6 @@ config BRIDGE_NETFILTER source "net/netfilter/Kconfig" source "net/ipv4/netfilter/Kconfig" source "net/ipv6/netfilter/Kconfig" -source "net/decnet/netfilter/Kconfig" source "net/bridge/netfilter/Kconfig" endif @@ -221,7 +220,6 @@ source "net/802/Kconfig" source "net/bridge/Kconfig" source "net/dsa/Kconfig" source "net/8021q/Kconfig" -source "net/decnet/Kconfig" source "net/llc/Kconfig" source "drivers/net/appletalk/Kconfig" source "net/x25/Kconfig" diff --git a/net/Makefile b/net/Makefile index fbfeb8a0bb37..6a62e5b27378 100644 --- a/net/Makefile +++ b/net/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_AF_KCM) += kcm/ obj-$(CONFIG_STREAM_PARSER) += strparser/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ -obj-$(CONFIG_DECNET) += decnet/ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ diff --git a/net/core/dev.c b/net/core/dev.c index 716df64fcfa5..6847022b9d66 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10370,9 +10370,7 @@ void netdev_run_todo(void) BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); -#if IS_ENABLED(CONFIG_DECNET) - WARN_ON(dev->dn_ptr); -#endif + if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5b669eb80270..833d2214f36f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1847,9 +1847,6 @@ static struct neigh_table *neigh_find_table(int family) case AF_INET6: tbl = neigh_tables[NEIGH_ND_TABLE]; break; - case AF_DECnet: - tbl = neigh_tables[NEIGH_DN_TABLE]; - break; } return tbl; diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig deleted file mode 100644 index 24336bdb1054..000000000000 --- a/net/decnet/Kconfig +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# DECnet configuration 
-# -config DECNET - tristate "DECnet Support" - help - The DECnet networking protocol was used in many products made by - Digital (now Compaq). It provides reliable stream and sequenced - packet communications over which run a variety of services similar - to those which run over TCP/IP. - - To find some tools to use with the kernel layer support, please - look at Patrick Caulfield's web site: - . - - More detailed documentation is available in - . - - Be sure to say Y to "/proc file system support" and "Sysctl support" - below when using DECnet, since you will need sysctl support to aid - in configuration at run time. - - The DECnet code is also available as a module ( = code which can be - inserted in and removed from the running kernel whenever you want). - The module is called decnet. - -config DECNET_ROUTER - bool "DECnet: router support" - depends on DECNET - select FIB_RULES - help - Add support for turning your DECnet Endnode into a level 1 or 2 - router. This is an experimental, but functional option. If you - do say Y here, then make sure that you also say Y to "Kernel/User - network link driver", "Routing messages" and "Network packet - filtering". The first two are required to allow configuration via - rtnetlink (you will need Alexey Kuznetsov's iproute2 package - from ). The "Network packet - filtering" option will be required for the forthcoming routing daemon - to work. - - See for more information. diff --git a/net/decnet/Makefile b/net/decnet/Makefile deleted file mode 100644 index 07b38e441b2d..000000000000 --- a/net/decnet/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-$(CONFIG_DECNET) += decnet.o - -decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \ - dn_route.o dn_dev.o dn_neigh.o dn_timer.o -decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o -decnet-y += sysctl_net_decnet.o - -obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/decnet/README b/net/decnet/README deleted file mode 100644 index 60e7ec88c81f..000000000000 --- a/net/decnet/README +++ /dev/null @@ -1,8 +0,0 @@ - Linux DECnet Project - ====================== - -The documentation for this kernel subsystem is available in the -Documentation/networking subdirectory of this distribution and also -on line at http://www.chygwyn.com/DECnet/ - -Steve Whitehouse diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c deleted file mode 100644 index 6582dfdfb932..000000000000 --- a/net/decnet/af_decnet.c +++ /dev/null @@ -1,2404 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Socket Layer Interface - * - * Authors: Eduardo Marcelo Serrat - * Patrick Caulfield - * - * Changes: - * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's - * version of the code. Original copyright preserved - * below. - * Steve Whitehouse: Some bug fixes, cleaning up some code to make it - * compatible with my routing layer. - * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick - * Caulfield. - * Steve Whitehouse: Further bug fixes, checking module code still works - * with new routing layer. - * Steve Whitehouse: Additional set/get_sockopt() calls. - * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new - * code. - * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like - * way. 
Didn't manage it entirely, but its better. - * Steve Whitehouse: ditto for sendmsg(). - * Steve Whitehouse: A selection of bug fixes to various things. - * Steve Whitehouse: Added TIOCOUTQ ioctl. - * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username. - * Steve Whitehouse: Fixes to connect() error returns. - * Patrick Caulfield: Fixes to delayed acceptance logic. - * David S. Miller: New socket locking - * Steve Whitehouse: Socket list hashing/locking - * Arnaldo C. Melo: use capable, not suser - * Steve Whitehouse: Removed unused code. Fix to use sk->allocation - * when required. - * Patrick Caulfield: /proc/net/decnet now has object name/number - * Steve Whitehouse: Fixed local port allocation, hashed sk list - * Matthew Wilcox: Fixes for dn_ioctl() - * Steve Whitehouse: New connect/accept logic to allow timeouts and - * prepare for sendpage etc. - */ - - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - - -HISTORY: - -Version Kernel Date Author/Comments -------- ------ ---- --------------- -Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat - (emserrat@geocities.com) - - First Development of DECnet Socket La- - yer for Linux. Only supports outgoing - connections. - -Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield - (patrick@pandh.demon.co.uk) - - Port to new kernel development version. - -Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat - (emserrat@geocities.com) - _ - Added support for incoming connections - so we can start developing server apps - on Linux. - - - Module Support -Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat - (emserrat@geocities.com) - _ - Added support for X11R6.4. Now we can - use DECnet transport for X on Linux!!! - - -Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat - (emserrat@geocities.com) - Removed bugs on flow control - Removed bugs on incoming accessdata - order - - -Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat - dn_recvmsg fixes - - Patrick J. Caulfield - dn_bind fixes -*******************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct dn_sock { - struct sock sk; - struct dn_scp scp; -}; - -static void dn_keepalive(struct sock *sk); - -#define DN_SK_HASH_SHIFT 8 -#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT) -#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) - - -static const struct proto_ops dn_proto_ops; -static DEFINE_RWLOCK(dn_hash_lock); -static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; -static struct hlist_head dn_wild_sk; -static atomic_long_t decnet_memory_allocated; -static DEFINE_PER_CPU(int, decnet_memory_per_cpu_fw_alloc); - -static int __dn_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen, int flags); -static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); - -static struct hlist_head *dn_find_list(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->addr.sdn_flags & SDF_WILD) - return hlist_empty(&dn_wild_sk) ? 
&dn_wild_sk : NULL; - - return &dn_sk_hash[le16_to_cpu(scp->addrloc) & DN_SK_HASH_MASK]; -} - -/* - * Valid ports are those greater than zero and not already in use. - */ -static int check_port(__le16 port) -{ - struct sock *sk; - - if (port == 0) - return -1; - - sk_for_each(sk, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) { - struct dn_scp *scp = DN_SK(sk); - if (scp->addrloc == port) - return -1; - } - return 0; -} - -static unsigned short port_alloc(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - static unsigned short port = 0x2000; - unsigned short i_port = port; - - while(check_port(cpu_to_le16(++port)) != 0) { - if (port == i_port) - return 0; - } - - scp->addrloc = cpu_to_le16(port); - - return 1; -} - -/* - * Since this is only ever called from user - * level, we don't need a write_lock() version - * of this. - */ -static int dn_hash_sock(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - struct hlist_head *list; - int rv = -EUSERS; - - BUG_ON(sk_hashed(sk)); - - write_lock_bh(&dn_hash_lock); - - if (!scp->addrloc && !port_alloc(sk)) - goto out; - - rv = -EADDRINUSE; - if ((list = dn_find_list(sk)) == NULL) - goto out; - - sk_add_node(sk, list); - rv = 0; -out: - write_unlock_bh(&dn_hash_lock); - return rv; -} - -static void dn_unhash_sock(struct sock *sk) -{ - write_lock(&dn_hash_lock); - sk_del_node_init(sk); - write_unlock(&dn_hash_lock); -} - -static void dn_unhash_sock_bh(struct sock *sk) -{ - write_lock_bh(&dn_hash_lock); - sk_del_node_init(sk); - write_unlock_bh(&dn_hash_lock); -} - -static struct hlist_head *listen_hash(struct sockaddr_dn *addr) -{ - int i; - unsigned int hash = addr->sdn_objnum; - - if (hash == 0) { - hash = addr->sdn_objnamel; - for(i = 0; i < le16_to_cpu(addr->sdn_objnamel); i++) { - hash ^= addr->sdn_objname[i]; - hash ^= (hash << 3); - } - } - - return &dn_sk_hash[hash & DN_SK_HASH_MASK]; -} - -/* - * Called to transform a socket from bound (i.e. with a local address) - * into a listening socket (doesn't need a local port number) and rehashes - * based upon the object name/number. - */ -static void dn_rehash_sock(struct sock *sk) -{ - struct hlist_head *list; - struct dn_scp *scp = DN_SK(sk); - - if (scp->addr.sdn_flags & SDF_WILD) - return; - - write_lock_bh(&dn_hash_lock); - sk_del_node_init(sk); - DN_SK(sk)->addrloc = 0; - list = listen_hash(&DN_SK(sk)->addr); - sk_add_node(sk, list); - write_unlock_bh(&dn_hash_lock); -} - -int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type) -{ - int len = 2; - - *buf++ = type; - - switch (type) { - case 0: - *buf++ = sdn->sdn_objnum; - break; - case 1: - *buf++ = 0; - *buf++ = le16_to_cpu(sdn->sdn_objnamel); - memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel)); - len = 3 + le16_to_cpu(sdn->sdn_objnamel); - break; - case 2: - memset(buf, 0, 5); - buf += 5; - *buf++ = le16_to_cpu(sdn->sdn_objnamel); - memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel)); - len = 7 + le16_to_cpu(sdn->sdn_objnamel); - break; - } - - return len; -} - -/* - * On reception of usernames, we handle types 1 and 0 for destination - * addresses only. Types 2 and 4 are used for source addresses, but the - * UIC, GIC are ignored and they are both treated the same way. Type 3 - * is never used as I've no idea what its purpose might be or what its - * format is. 
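
To make the wire formats described above concrete, here is a minimal standalone sketch of the encoding that dn_sockaddr2username() (shown earlier in this hunk) performs for the three destination formats: a format byte followed by either an object number or a counted object name. The helper name and the object number and name in main() are invented for illustration.

#include <stdio.h>
#include <string.h>

static int encode_username(unsigned char *buf, unsigned char fmt,
			   unsigned char objnum,
			   const char *name, unsigned char namel)
{
	int len = 2;

	*buf++ = fmt;
	switch (fmt) {
	case 0:				/* object number only */
		*buf++ = objnum;
		break;
	case 1:				/* short counted name */
		*buf++ = 0;
		*buf++ = namel;
		memcpy(buf, name, namel);
		len = 3 + namel;
		break;
	case 2:				/* counted name behind a zeroed 5-byte field */
		memset(buf, 0, 5);
		buf += 5;
		*buf++ = namel;
		memcpy(buf, name, namel);
		len = 7 + namel;
		break;
	}
	return len;
}

int main(void)
{
	unsigned char buf[32];

	printf("fmt0: %d bytes\n", encode_username(buf, 0, 25, NULL, 0));
	printf("fmt1: %d bytes\n", encode_username(buf, 1, 0, "MIRROR", 6));
	return 0;
}
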
- */ -int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt) -{ - unsigned char type; - int size = len; - int namel = 12; - - sdn->sdn_objnum = 0; - sdn->sdn_objnamel = cpu_to_le16(0); - memset(sdn->sdn_objname, 0, DN_MAXOBJL); - - if (len < 2) - return -1; - - len -= 2; - *fmt = *data++; - type = *data++; - - switch (*fmt) { - case 0: - sdn->sdn_objnum = type; - return 2; - case 1: - namel = 16; - break; - case 2: - len -= 4; - data += 4; - break; - case 4: - len -= 8; - data += 8; - break; - default: - return -1; - } - - len -= 1; - - if (len < 0) - return -1; - - sdn->sdn_objnamel = cpu_to_le16(*data++); - len -= le16_to_cpu(sdn->sdn_objnamel); - - if ((len < 0) || (le16_to_cpu(sdn->sdn_objnamel) > namel)) - return -1; - - memcpy(sdn->sdn_objname, data, le16_to_cpu(sdn->sdn_objnamel)); - - return size - len; -} - -struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr) -{ - struct hlist_head *list = listen_hash(addr); - struct sock *sk; - - read_lock(&dn_hash_lock); - sk_for_each(sk, list) { - struct dn_scp *scp = DN_SK(sk); - if (sk->sk_state != TCP_LISTEN) - continue; - if (scp->addr.sdn_objnum) { - if (scp->addr.sdn_objnum != addr->sdn_objnum) - continue; - } else { - if (addr->sdn_objnum) - continue; - if (scp->addr.sdn_objnamel != addr->sdn_objnamel) - continue; - if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, le16_to_cpu(addr->sdn_objnamel)) != 0) - continue; - } - sock_hold(sk); - read_unlock(&dn_hash_lock); - return sk; - } - - sk = sk_head(&dn_wild_sk); - if (sk) { - if (sk->sk_state == TCP_LISTEN) - sock_hold(sk); - else - sk = NULL; - } - - read_unlock(&dn_hash_lock); - return sk; -} - -struct sock *dn_find_by_skb(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct sock *sk; - struct dn_scp *scp; - - read_lock(&dn_hash_lock); - sk_for_each(sk, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) { - scp = DN_SK(sk); - if (cb->src != dn_saddr2dn(&scp->peer)) - continue; - if (cb->dst_port != scp->addrloc) - continue; - if (scp->addrrem && (cb->src_port != scp->addrrem)) - continue; - sock_hold(sk); - goto found; - } - sk = NULL; -found: - read_unlock(&dn_hash_lock); - return sk; -} - - - -static void dn_destruct(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - skb_queue_purge(&scp->data_xmit_queue); - skb_queue_purge(&scp->other_xmit_queue); - skb_queue_purge(&scp->other_receive_queue); - - dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); -} - -static unsigned long dn_memory_pressure; - -static void dn_enter_memory_pressure(struct sock *sk) -{ - if (!dn_memory_pressure) { - dn_memory_pressure = 1; - } -} - -static struct proto dn_proto = { - .name = "NSP", - .owner = THIS_MODULE, - .enter_memory_pressure = dn_enter_memory_pressure, - .memory_pressure = &dn_memory_pressure, - - .memory_allocated = &decnet_memory_allocated, - .per_cpu_fw_alloc = &decnet_memory_per_cpu_fw_alloc, - - .sysctl_mem = sysctl_decnet_mem, - .sysctl_wmem = sysctl_decnet_wmem, - .sysctl_rmem = sysctl_decnet_rmem, - .max_header = DN_MAX_NSP_DATA_HEADER + 64, - .obj_size = sizeof(struct dn_sock), -}; - -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) -{ - struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); - - if (!sk) - goto out; - - if (sock) - sock->ops = &dn_proto_ops; - sock_init_data(sock, sk); - - sk->sk_backlog_rcv = dn_nsp_backlog_rcv; - sk->sk_destruct = dn_destruct; - sk->sk_no_check_tx = 1; - sk->sk_family = 
PF_DECnet; - sk->sk_protocol = 0; - sk->sk_allocation = gfp; - sk->sk_sndbuf = READ_ONCE(sysctl_decnet_wmem[1]); - sk->sk_rcvbuf = READ_ONCE(sysctl_decnet_rmem[1]); - - /* Initialization of DECnet Session Control Port */ - scp = DN_SK(sk); - scp->state = DN_O; /* Open */ - scp->numdat = 1; /* Next data seg to tx */ - scp->numoth = 1; /* Next oth data to tx */ - scp->ackxmt_dat = 0; /* Last data seg ack'ed */ - scp->ackxmt_oth = 0; /* Last oth data ack'ed */ - scp->ackrcv_dat = 0; /* Highest data ack recv*/ - scp->ackrcv_oth = 0; /* Last oth data ack rec*/ - scp->flowrem_sw = DN_SEND; - scp->flowloc_sw = DN_SEND; - scp->flowrem_dat = 0; - scp->flowrem_oth = 1; - scp->flowloc_dat = 0; - scp->flowloc_oth = 1; - scp->services_rem = 0; - scp->services_loc = 1 | NSP_FC_NONE; - scp->info_rem = 0; - scp->info_loc = 0x03; /* NSP version 4.1 */ - scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */ - scp->nonagle = 0; - scp->multi_ireq = 1; - scp->accept_mode = ACC_IMMED; - scp->addr.sdn_family = AF_DECnet; - scp->peer.sdn_family = AF_DECnet; - scp->accessdata.acc_accl = 5; - memcpy(scp->accessdata.acc_acc, "LINUX", 5); - - scp->max_window = NSP_MAX_WINDOW; - scp->snd_window = NSP_MIN_WINDOW; - scp->nsp_srtt = NSP_INITIAL_SRTT; - scp->nsp_rttvar = NSP_INITIAL_RTTVAR; - scp->nsp_rxtshift = 0; - - skb_queue_head_init(&scp->data_xmit_queue); - skb_queue_head_init(&scp->other_xmit_queue); - skb_queue_head_init(&scp->other_receive_queue); - - scp->persist = 0; - scp->persist_fxn = NULL; - scp->keepalive = 10 * HZ; - scp->keepalive_fxn = dn_keepalive; - - dn_start_slow_timer(sk); -out: - return sk; -} - -/* - * Keepalive timer. - * FIXME: Should respond to SO_KEEPALIVE etc. - */ -static void dn_keepalive(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - /* - * By checking the other_data transmit queue is empty - * we are double checking that we are not sending too - * many of these keepalive frames. - */ - if (skb_queue_empty(&scp->other_xmit_queue)) - dn_nsp_send_link(sk, DN_NOCHANGE, 0); -} - - -/* - * Timer for shutdown/destroyed sockets. - * When socket is dead & no packets have been sent for a - * certain amount of time, they are removed by this - * routine. Also takes care of sending out DI & DC - * frames at correct times. 
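
A hedged sketch of the teardown policy this comment describes and dn_destroy_timer() below implements: keep retransmitting a disconnect message on each timer tick until a per-state retry budget is spent, then move to a follow-on state. The states, message names and retry limit here are illustrative stand-ins, not the kernel's DN_* states or decnet_di_count/decnet_dr_count values.

#include <stdio.h>

enum state { DI, DR, DN, DONE };

struct rule {
	enum state cur;
	const char *resend;	/* message retransmitted on each tick */
	int budget;		/* retries before advancing */
	enum state next;
};

static const struct rule rules[] = {
	{ DI, "DISCINIT", 3, DONE },	/* roughly: DN_DI -> DN_CN */
	{ DR, "DISCINIT", 3, DONE },	/* roughly: DN_DR -> DN_DRC */
	{ DN, "DISCCONF", 3, DONE },
};

int main(void)
{
	enum state s = DI;
	int shift = 0;		/* plays the role of scp->nsp_rxtshift */

	while (s != DONE) {
		const struct rule *r = &rules[s];

		printf("tick %d: resend %s\n", shift, r->resend);
		if (++shift >= r->budget)
			s = r->next;
	}
	return 0;
}
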
- */ -int dn_destroy_timer(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - scp->persist = dn_nsp_persist(sk); - - switch (scp->state) { - case DN_DI: - dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); - if (scp->nsp_rxtshift >= decnet_di_count) - scp->state = DN_CN; - return 0; - - case DN_DR: - dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); - if (scp->nsp_rxtshift >= decnet_dr_count) - scp->state = DN_DRC; - return 0; - - case DN_DN: - if (scp->nsp_rxtshift < decnet_dn_count) { - /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */ - dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, - GFP_ATOMIC); - return 0; - } - } - - scp->persist = (HZ * decnet_time_wait); - - if (sk->sk_socket) - return 0; - - if (time_after_eq(jiffies, scp->stamp + HZ * decnet_time_wait)) { - dn_unhash_sock(sk); - sock_put(sk); - return 1; - } - - return 0; -} - -static void dn_destroy_sock(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - scp->nsp_rxtshift = 0; /* reset back off */ - - if (sk->sk_socket) { - if (sk->sk_socket->state != SS_UNCONNECTED) - sk->sk_socket->state = SS_DISCONNECTING; - } - - sk->sk_state = TCP_CLOSE; - - switch (scp->state) { - case DN_DN: - dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, - sk->sk_allocation); - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - break; - case DN_CR: - scp->state = DN_DR; - goto disc_reject; - case DN_RUN: - scp->state = DN_DI; - fallthrough; - case DN_DI: - case DN_DR: -disc_reject: - dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation); - fallthrough; - case DN_NC: - case DN_NR: - case DN_RJ: - case DN_DIC: - case DN_CN: - case DN_DRC: - case DN_CI: - case DN_CD: - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - break; - default: - printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n"); - fallthrough; - case DN_O: - dn_stop_slow_timer(sk); - - dn_unhash_sock_bh(sk); - sock_put(sk); - - break; - } -} - -char *dn_addr2asc(__u16 addr, char *buf) -{ - unsigned short node, area; - - node = addr & 0x03ff; - area = addr >> 10; - sprintf(buf, "%hd.%hd", area, node); - - return buf; -} - - - -static int dn_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - struct sock *sk; - - if (protocol < 0 || protocol > U8_MAX) - return -EINVAL; - - if (!net_eq(net, &init_net)) - return -EAFNOSUPPORT; - - switch (sock->type) { - case SOCK_SEQPACKET: - if (protocol != DNPROTO_NSP) - return -EPROTONOSUPPORT; - break; - case SOCK_STREAM: - break; - default: - return -ESOCKTNOSUPPORT; - } - - - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) - return -ENOBUFS; - - sk->sk_protocol = protocol; - - return 0; -} - - -static int -dn_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (sk) { - sock_orphan(sk); - sock_hold(sk); - lock_sock(sk); - dn_destroy_sock(sk); - release_sock(sk); - sock_put(sk); - } - - return 0; -} - -static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr; - struct net_device *dev, *ldev; - int rv; - - if (addr_len != sizeof(struct sockaddr_dn)) - return -EINVAL; - - if (saddr->sdn_family != AF_DECnet) - return -EINVAL; - - if (le16_to_cpu(saddr->sdn_nodeaddrl) && (le16_to_cpu(saddr->sdn_nodeaddrl) != 2)) - return -EINVAL; - - if (le16_to_cpu(saddr->sdn_objnamel) > DN_MAXOBJL) - return -EINVAL; - - if (saddr->sdn_flags & ~SDF_WILD) - return -EINVAL; - - if 
(!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum || - (saddr->sdn_flags & SDF_WILD))) - return -EACCES; - - if (!(saddr->sdn_flags & SDF_WILD)) { - if (le16_to_cpu(saddr->sdn_nodeaddrl)) { - rcu_read_lock(); - ldev = NULL; - for_each_netdev_rcu(&init_net, dev) { - if (!dev->dn_ptr) - continue; - if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) { - ldev = dev; - break; - } - } - rcu_read_unlock(); - if (ldev == NULL) - return -EADDRNOTAVAIL; - } - } - - rv = -EINVAL; - lock_sock(sk); - if (sock_flag(sk, SOCK_ZAPPED)) { - memcpy(&scp->addr, saddr, addr_len); - sock_reset_flag(sk, SOCK_ZAPPED); - - rv = dn_hash_sock(sk); - if (rv) - sock_set_flag(sk, SOCK_ZAPPED); - } - release_sock(sk); - - return rv; -} - - -static int dn_auto_bind(struct socket *sock) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - int rv; - - sock_reset_flag(sk, SOCK_ZAPPED); - - scp->addr.sdn_flags = 0; - scp->addr.sdn_objnum = 0; - - /* - * This stuff is to keep compatibility with Eduardo's - * patch. I hope I can dispense with it shortly... - */ - if ((scp->accessdata.acc_accl != 0) && - (scp->accessdata.acc_accl <= 12)) { - - scp->addr.sdn_objnamel = cpu_to_le16(scp->accessdata.acc_accl); - memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, le16_to_cpu(scp->addr.sdn_objnamel)); - - scp->accessdata.acc_accl = 0; - memset(scp->accessdata.acc_acc, 0, 40); - } - /* End of compatibility stuff */ - - scp->addr.sdn_add.a_len = cpu_to_le16(2); - rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr); - if (rv == 0) { - rv = dn_hash_sock(sk); - if (rv) - sock_set_flag(sk, SOCK_ZAPPED); - } - - return rv; -} - -static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) -{ - struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err; - - if (scp->state != DN_CR) - return -EINVAL; - - scp->state = DN_CC; - scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk)); - dn_send_conn_conf(sk, allocation); - - add_wait_queue(sk_sleep(sk), &wait); - for(;;) { - release_sock(sk); - if (scp->state == DN_CC) - *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); - lock_sock(sk); - err = 0; - if (scp->state == DN_RUN) - break; - err = sock_error(sk); - if (err) - break; - err = sock_intr_errno(*timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!*timeo) - break; - } - remove_wait_queue(sk_sleep(sk), &wait); - if (err == 0) { - sk->sk_socket->state = SS_CONNECTED; - } else if (scp->state != DN_CC) { - sk->sk_socket->state = SS_UNCONNECTED; - } - return err; -} - -static int dn_wait_run(struct sock *sk, long *timeo) -{ - struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err = 0; - - if (scp->state == DN_RUN) - goto out; - - if (!*timeo) - return -EALREADY; - - add_wait_queue(sk_sleep(sk), &wait); - for(;;) { - release_sock(sk); - if (scp->state == DN_CI || scp->state == DN_CC) - *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); - lock_sock(sk); - err = 0; - if (scp->state == DN_RUN) - break; - err = sock_error(sk); - if (err) - break; - err = sock_intr_errno(*timeo); - if (signal_pending(current)) - break; - err = -ETIMEDOUT; - if (!*timeo) - break; - } - remove_wait_queue(sk_sleep(sk), &wait); -out: - if (err == 0) { - sk->sk_socket->state = SS_CONNECTED; - } else if (scp->state != DN_CI && scp->state != DN_CC) { - sk->sk_socket->state = SS_UNCONNECTED; - } - return err; -} - -static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) -{ - struct socket *sock 
= sk->sk_socket; - struct dn_scp *scp = DN_SK(sk); - int err = -EISCONN; - struct flowidn fld; - struct dst_entry *dst; - - if (sock->state == SS_CONNECTED) - goto out; - - if (sock->state == SS_CONNECTING) { - err = 0; - if (scp->state == DN_RUN) { - sock->state = SS_CONNECTED; - goto out; - } - err = -ECONNREFUSED; - if (scp->state != DN_CI && scp->state != DN_CC) { - sock->state = SS_UNCONNECTED; - goto out; - } - return dn_wait_run(sk, timeo); - } - - err = -EINVAL; - if (scp->state != DN_O) - goto out; - - if (addr == NULL || addrlen != sizeof(struct sockaddr_dn)) - goto out; - if (addr->sdn_family != AF_DECnet) - goto out; - if (addr->sdn_flags & SDF_WILD) - goto out; - - if (sock_flag(sk, SOCK_ZAPPED)) { - err = dn_auto_bind(sk->sk_socket); - if (err) - goto out; - } - - memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn)); - - err = -EHOSTUNREACH; - memset(&fld, 0, sizeof(fld)); - fld.flowidn_oif = sk->sk_bound_dev_if; - fld.daddr = dn_saddr2dn(&scp->peer); - fld.saddr = dn_saddr2dn(&scp->addr); - dn_sk_ports_copy(&fld, scp); - fld.flowidn_proto = DNPROTO_NSP; - if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0) - goto out; - dst = __sk_dst_get(sk); - sk->sk_route_caps = dst->dev->features; - sock->state = SS_CONNECTING; - scp->state = DN_CI; - scp->segsize_loc = dst_metric_advmss(dst); - - dn_nsp_send_conninit(sk, NSP_CI); - err = -EINPROGRESS; - if (*timeo) { - err = dn_wait_run(sk, timeo); - } -out: - return err; -} - -static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) -{ - struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr; - struct sock *sk = sock->sk; - int err; - long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - - lock_sock(sk); - err = __dn_connect(sk, addr, addrlen, &timeo, 0); - release_sock(sk); - - return err; -} - -static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) -{ - struct dn_scp *scp = DN_SK(sk); - - switch (scp->state) { - case DN_RUN: - return 0; - case DN_CR: - return dn_confirm_accept(sk, timeo, sk->sk_allocation); - case DN_CI: - case DN_CC: - return dn_wait_run(sk, timeo); - case DN_O: - return __dn_connect(sk, addr, addrlen, timeo, flags); - } - - return -EINVAL; -} - - -static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc) -{ - unsigned char *ptr = skb->data; - - acc->acc_userl = *ptr++; - memcpy(&acc->acc_user, ptr, acc->acc_userl); - ptr += acc->acc_userl; - - acc->acc_passl = *ptr++; - memcpy(&acc->acc_pass, ptr, acc->acc_passl); - ptr += acc->acc_passl; - - acc->acc_accl = *ptr++; - memcpy(&acc->acc_acc, ptr, acc->acc_accl); - - skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3); - -} - -static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt) -{ - unsigned char *ptr = skb->data; - u16 len = *ptr++; /* yes, it's 8bit on the wire */ - - BUG_ON(len > 16); /* we've checked the contents earlier */ - opt->opt_optl = cpu_to_le16(len); - opt->opt_status = 0; - memcpy(opt->opt_data, ptr, len); - skb_pull(skb, len + 1); -} - -static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sk_buff *skb = NULL; - int err = 0; - - add_wait_queue(sk_sleep(sk), &wait); - for(;;) { - release_sock(sk); - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) { - *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); - skb = skb_dequeue(&sk->sk_receive_queue); - } - lock_sock(sk); - if (skb != NULL) - break; - err = 
-EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(*timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!*timeo) - break; - } - remove_wait_queue(sk_sleep(sk), &wait); - - return skb == NULL ? ERR_PTR(err) : skb; -} - -static int dn_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) -{ - struct sock *sk = sock->sk, *newsk; - struct sk_buff *skb = NULL; - struct dn_skb_cb *cb; - unsigned char menuver; - int err = 0; - unsigned char type; - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - struct dst_entry *dst; - - lock_sock(sk); - - if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) { - release_sock(sk); - return -EINVAL; - } - - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) { - skb = dn_wait_for_connect(sk, &timeo); - if (IS_ERR(skb)) { - release_sock(sk); - return PTR_ERR(skb); - } - } - - cb = DN_SKB_CB(skb); - sk_acceptq_removed(sk); - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); - if (newsk == NULL) { - release_sock(sk); - kfree_skb(skb); - return -ENOBUFS; - } - release_sock(sk); - - dst = skb_dst(skb); - sk_dst_set(newsk, dst); - skb_dst_set(skb, NULL); - - DN_SK(newsk)->state = DN_CR; - DN_SK(newsk)->addrrem = cb->src_port; - DN_SK(newsk)->services_rem = cb->services; - DN_SK(newsk)->info_rem = cb->info; - DN_SK(newsk)->segsize_rem = cb->segsize; - DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode; - - if (DN_SK(newsk)->segsize_rem < 230) - DN_SK(newsk)->segsize_rem = 230; - - if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE) - DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd; - - newsk->sk_state = TCP_LISTEN; - memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn)); - - /* - * If we are listening on a wild socket, we don't want - * the newly created socket on the wrong hash queue. - */ - DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD; - - skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type)); - skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type)); - *(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src; - *(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst; - - menuver = *skb->data; - skb_pull(skb, 1); - - if (menuver & DN_MENUVER_ACC) - dn_access_copy(skb, &(DN_SK(newsk)->accessdata)); - - if (menuver & DN_MENUVER_USR) - dn_user_copy(skb, &(DN_SK(newsk)->conndata_in)); - - if (menuver & DN_MENUVER_PRX) - DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY; - - if (menuver & DN_MENUVER_UIC) - DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY; - - kfree_skb(skb); - - memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out), - sizeof(struct optdata_dn)); - memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out), - sizeof(struct optdata_dn)); - - lock_sock(newsk); - err = dn_hash_sock(newsk); - if (err == 0) { - sock_reset_flag(newsk, SOCK_ZAPPED); - dn_send_conn_ack(newsk); - - /* - * Here we use sk->sk_allocation since although the conn conf is - * for the newsk, the context is the old socket. 
- */ - if (DN_SK(newsk)->accept_mode == ACC_IMMED) - err = dn_confirm_accept(newsk, &timeo, - sk->sk_allocation); - } - release_sock(newsk); - return err; -} - - -static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) -{ - struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr; - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - - lock_sock(sk); - - if (peer) { - if ((sock->state != SS_CONNECTED && - sock->state != SS_CONNECTING) && - scp->accept_mode == ACC_IMMED) { - release_sock(sk); - return -ENOTCONN; - } - - memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn)); - } else { - memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn)); - } - - release_sock(sk); - - return sizeof(struct sockaddr_dn); -} - - -static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll(file, sock, wait); - - if (!skb_queue_empty_lockless(&scp->other_receive_queue)) - mask |= EPOLLRDBAND; - - return mask; -} - -static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - int err = -EOPNOTSUPP; - long amount = 0; - struct sk_buff *skb; - int val; - - switch(cmd) - { - case SIOCGIFADDR: - case SIOCSIFADDR: - return dn_dev_ioctl(cmd, (void __user *)arg); - - case SIOCATMARK: - lock_sock(sk); - val = !skb_queue_empty(&scp->other_receive_queue); - if (scp->state != DN_RUN) - val = -ENOTCONN; - release_sock(sk); - return val; - - case TIOCOUTQ: - amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); - if (amount < 0) - amount = 0; - err = put_user(amount, (int __user *)arg); - break; - - case TIOCINQ: - lock_sock(sk); - skb = skb_peek(&scp->other_receive_queue); - if (skb) { - amount = skb->len; - } else { - skb_queue_walk(&sk->sk_receive_queue, skb) - amount += skb->len; - } - release_sock(sk); - err = put_user(amount, (int __user *)arg); - break; - - default: - err = -ENOIOCTLCMD; - break; - } - - return err; -} - -static int dn_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - int err = -EINVAL; - - lock_sock(sk); - - if (sock_flag(sk, SOCK_ZAPPED)) - goto out; - - if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN)) - goto out; - - sk->sk_max_ack_backlog = backlog; - sk->sk_ack_backlog = 0; - sk->sk_state = TCP_LISTEN; - err = 0; - dn_rehash_sock(sk); - -out: - release_sock(sk); - - return err; -} - - -static int dn_shutdown(struct socket *sock, int how) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - int err = -ENOTCONN; - - lock_sock(sk); - - if (sock->state == SS_UNCONNECTED) - goto out; - - err = 0; - if (sock->state == SS_DISCONNECTING) - goto out; - - err = -EINVAL; - if (scp->state == DN_O) - goto out; - - if (how != SHUT_RDWR) - goto out; - - sk->sk_shutdown = SHUTDOWN_MASK; - dn_destroy_sock(sk); - err = 0; - -out: - release_sock(sk); - - return err; -} - -static int dn_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - int err; - - lock_sock(sk); - err = __dn_setsockopt(sock, level, optname, optval, optlen, 0); - release_sock(sk); -#ifdef CONFIG_NETFILTER - /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != DSO_LINKINFO && - optname != DSO_STREAM && optname != DSO_SEQPACKET) - err = nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); -#endif - - return err; -} - -static int 
__dn_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen, int flags) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - long timeo; - union { - struct optdata_dn opt; - struct accessdata_dn acc; - int mode; - unsigned long win; - int val; - unsigned char services; - unsigned char info; - } u; - int err; - - if (optlen && sockptr_is_null(optval)) - return -EINVAL; - - if (optlen > sizeof(u)) - return -EINVAL; - - if (copy_from_sockptr(&u, optval, optlen)) - return -EFAULT; - - switch (optname) { - case DSO_CONDATA: - if (sock->state == SS_CONNECTED) - return -EISCONN; - if ((scp->state != DN_O) && (scp->state != DN_CR)) - return -EINVAL; - - if (optlen != sizeof(struct optdata_dn)) - return -EINVAL; - - if (le16_to_cpu(u.opt.opt_optl) > 16) - return -EINVAL; - - memcpy(&scp->conndata_out, &u.opt, optlen); - break; - - case DSO_DISDATA: - if (sock->state != SS_CONNECTED && - scp->accept_mode == ACC_IMMED) - return -ENOTCONN; - - if (optlen != sizeof(struct optdata_dn)) - return -EINVAL; - - if (le16_to_cpu(u.opt.opt_optl) > 16) - return -EINVAL; - - memcpy(&scp->discdata_out, &u.opt, optlen); - break; - - case DSO_CONACCESS: - if (sock->state == SS_CONNECTED) - return -EISCONN; - if (scp->state != DN_O) - return -EINVAL; - - if (optlen != sizeof(struct accessdata_dn)) - return -EINVAL; - - if ((u.acc.acc_accl > DN_MAXACCL) || - (u.acc.acc_passl > DN_MAXACCL) || - (u.acc.acc_userl > DN_MAXACCL)) - return -EINVAL; - - memcpy(&scp->accessdata, &u.acc, optlen); - break; - - case DSO_ACCEPTMODE: - if (sock->state == SS_CONNECTED) - return -EISCONN; - if (scp->state != DN_O) - return -EINVAL; - - if (optlen != sizeof(int)) - return -EINVAL; - - if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER)) - return -EINVAL; - - scp->accept_mode = (unsigned char)u.mode; - break; - - case DSO_CONACCEPT: - if (scp->state != DN_CR) - return -EINVAL; - timeo = sock_rcvtimeo(sk, 0); - err = dn_confirm_accept(sk, &timeo, sk->sk_allocation); - return err; - - case DSO_CONREJECT: - if (scp->state != DN_CR) - return -EINVAL; - - scp->state = DN_DR; - sk->sk_shutdown = SHUTDOWN_MASK; - dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation); - break; - - case DSO_MAXWINDOW: - if (optlen != sizeof(unsigned long)) - return -EINVAL; - if (u.win > NSP_MAX_WINDOW) - u.win = NSP_MAX_WINDOW; - if (u.win == 0) - return -EINVAL; - scp->max_window = u.win; - if (scp->snd_window > u.win) - scp->snd_window = u.win; - break; - - case DSO_NODELAY: - if (optlen != sizeof(int)) - return -EINVAL; - if (scp->nonagle == TCP_NAGLE_CORK) - return -EINVAL; - scp->nonagle = (u.val == 0) ? 0 : TCP_NAGLE_OFF; - /* if (scp->nonagle == 1) { Push pending frames } */ - break; - - case DSO_CORK: - if (optlen != sizeof(int)) - return -EINVAL; - if (scp->nonagle == TCP_NAGLE_OFF) - return -EINVAL; - scp->nonagle = (u.val == 0) ? 
0 : TCP_NAGLE_CORK; - /* if (scp->nonagle == 0) { Push pending frames } */ - break; - - case DSO_SERVICES: - if (optlen != sizeof(unsigned char)) - return -EINVAL; - if ((u.services & ~NSP_FC_MASK) != 0x01) - return -EINVAL; - if ((u.services & NSP_FC_MASK) == NSP_FC_MASK) - return -EINVAL; - scp->services_loc = u.services; - break; - - case DSO_INFO: - if (optlen != sizeof(unsigned char)) - return -EINVAL; - if (u.info & 0xfc) - return -EINVAL; - scp->info_loc = u.info; - break; - - case DSO_LINKINFO: - case DSO_STREAM: - case DSO_SEQPACKET: - default: - return -ENOPROTOOPT; - } - - return 0; -} - -static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - int err; - - lock_sock(sk); - err = __dn_getsockopt(sock, level, optname, optval, optlen, 0); - release_sock(sk); -#ifdef CONFIG_NETFILTER - if (err == -ENOPROTOOPT && optname != DSO_STREAM && - optname != DSO_SEQPACKET && optname != DSO_CONACCEPT && - optname != DSO_CONREJECT) { - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - err = nf_getsockopt(sk, PF_DECnet, optname, optval, &len); - if (err >= 0) - err = put_user(len, optlen); - } -#endif - - return err; -} - -static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - struct linkinfo_dn link; - unsigned int r_len; - void *r_data = NULL; - unsigned int val; - - if(get_user(r_len , optlen)) - return -EFAULT; - - switch (optname) { - case DSO_CONDATA: - if (r_len > sizeof(struct optdata_dn)) - r_len = sizeof(struct optdata_dn); - r_data = &scp->conndata_in; - break; - - case DSO_DISDATA: - if (r_len > sizeof(struct optdata_dn)) - r_len = sizeof(struct optdata_dn); - r_data = &scp->discdata_in; - break; - - case DSO_CONACCESS: - if (r_len > sizeof(struct accessdata_dn)) - r_len = sizeof(struct accessdata_dn); - r_data = &scp->accessdata; - break; - - case DSO_ACCEPTMODE: - if (r_len > sizeof(unsigned char)) - r_len = sizeof(unsigned char); - r_data = &scp->accept_mode; - break; - - case DSO_LINKINFO: - if (r_len > sizeof(struct linkinfo_dn)) - r_len = sizeof(struct linkinfo_dn); - - memset(&link, 0, sizeof(link)); - - switch (sock->state) { - case SS_CONNECTING: - link.idn_linkstate = LL_CONNECTING; - break; - case SS_DISCONNECTING: - link.idn_linkstate = LL_DISCONNECTING; - break; - case SS_CONNECTED: - link.idn_linkstate = LL_RUNNING; - break; - default: - link.idn_linkstate = LL_INACTIVE; - } - - link.idn_segsize = scp->segsize_rem; - r_data = &link; - break; - - case DSO_MAXWINDOW: - if (r_len > sizeof(unsigned long)) - r_len = sizeof(unsigned long); - r_data = &scp->max_window; - break; - - case DSO_NODELAY: - if (r_len > sizeof(int)) - r_len = sizeof(int); - val = (scp->nonagle == TCP_NAGLE_OFF); - r_data = &val; - break; - - case DSO_CORK: - if (r_len > sizeof(int)) - r_len = sizeof(int); - val = (scp->nonagle == TCP_NAGLE_CORK); - r_data = &val; - break; - - case DSO_SERVICES: - if (r_len > sizeof(unsigned char)) - r_len = sizeof(unsigned char); - r_data = &scp->services_rem; - break; - - case DSO_INFO: - if (r_len > sizeof(unsigned char)) - r_len = sizeof(unsigned char); - r_data = &scp->info_rem; - break; - - case DSO_STREAM: - case DSO_SEQPACKET: - case DSO_CONACCEPT: - case DSO_CONREJECT: - default: - return -ENOPROTOOPT; - } - - if (r_data) { - if (copy_to_user(optval, r_data, r_len)) - return -EFAULT; - if (put_user(r_len, optlen)) - return 
-EFAULT; - } - - return 0; -} - - -static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target) -{ - struct sk_buff *skb; - int len = 0; - - if (flags & MSG_OOB) - return !skb_queue_empty(q) ? 1 : 0; - - skb_queue_walk(q, skb) { - struct dn_skb_cb *cb = DN_SKB_CB(skb); - len += skb->len; - - if (cb->nsp_flags & 0x40) { - /* SOCK_SEQPACKET reads to EOM */ - if (sk->sk_type == SOCK_SEQPACKET) - return 1; - /* so does SOCK_STREAM unless WAITALL is specified */ - if (!(flags & MSG_WAITALL)) - return 1; - } - - /* minimum data length for read exceeded */ - if (len >= target) - return 1; - } - - return 0; -} - - -static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, - int flags) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - struct sk_buff_head *queue = &sk->sk_receive_queue; - size_t target = size > 1 ? 1 : 0; - size_t copied = 0; - int rv = 0; - struct sk_buff *skb, *n; - struct dn_skb_cb *cb = NULL; - unsigned char eor = 0; - long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - lock_sock(sk); - - if (sock_flag(sk, SOCK_ZAPPED)) { - rv = -EADDRNOTAVAIL; - goto out; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - rv = 0; - goto out; - } - - rv = dn_check_state(sk, NULL, 0, &timeo, flags); - if (rv) - goto out; - - if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { - rv = -EOPNOTSUPP; - goto out; - } - - if (flags & MSG_OOB) - queue = &scp->other_receive_queue; - - if (flags & MSG_WAITALL) - target = size; - - - /* - * See if there is data ready to read, sleep if there isn't - */ - for(;;) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - if (sk->sk_err) - goto out; - - if (!skb_queue_empty(&scp->other_receive_queue)) { - if (!(flags & MSG_OOB)) { - msg->msg_flags |= MSG_OOB; - if (!scp->other_report) { - scp->other_report = 1; - goto out; - } - } - } - - if (scp->state != DN_RUN) - goto out; - - if (signal_pending(current)) { - rv = sock_intr_errno(timeo); - goto out; - } - - if (dn_data_ready(sk, queue, flags, target)) - break; - - if (flags & MSG_DONTWAIT) { - rv = -EWOULDBLOCK; - goto out; - } - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - } - - skb_queue_walk_safe(queue, skb, n) { - unsigned int chunk = skb->len; - cb = DN_SKB_CB(skb); - - if ((chunk + copied) > size) - chunk = size - copied; - - if (memcpy_to_msg(msg, skb->data, chunk)) { - rv = -EFAULT; - break; - } - copied += chunk; - - if (!(flags & MSG_PEEK)) - skb_pull(skb, chunk); - - eor = cb->nsp_flags & 0x40; - - if (skb->len == 0) { - skb_unlink(skb, queue); - kfree_skb(skb); - /* - * N.B. Don't refer to skb or cb after this point - * in loop. - */ - if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) { - scp->flowloc_sw = DN_SEND; - dn_nsp_send_link(sk, DN_SEND, 0); - } - } - - if (eor) { - if (sk->sk_type == SOCK_SEQPACKET) - break; - if (!(flags & MSG_WAITALL)) - break; - } - - if (flags & MSG_OOB) - break; - - if (copied >= target) - break; - } - - rv = copied; - - - if (eor && (sk->sk_type == SOCK_SEQPACKET)) - msg->msg_flags |= MSG_EOR; - -out: - if (rv == 0) - rv = (flags & MSG_PEEK) ? 
-sk->sk_err : sock_error(sk); - - if ((rv >= 0) && msg->msg_name) { - __sockaddr_check_size(sizeof(struct sockaddr_dn)); - memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn)); - msg->msg_namelen = sizeof(struct sockaddr_dn); - } - - release_sock(sk); - - return rv; -} - - -static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags) -{ - unsigned char fctype = scp->services_rem & NSP_FC_MASK; - if (skb_queue_len(queue) >= scp->snd_window) - return 1; - if (fctype != NSP_FC_NONE) { - if (flags & MSG_OOB) { - if (scp->flowrem_oth == 0) - return 1; - } else { - if (scp->flowrem_dat == 0) - return 1; - } - } - return 0; -} - -/* - * The DECnet spec requires that the "routing layer" accepts packets which - * are at least 230 bytes in size. This excludes any headers which the NSP - * layer might add, so we always assume that we'll be using the maximal - * length header on data packets. The variation in length is due to the - * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't - * make much practical difference. - */ -unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu) -{ - unsigned int mss = 230 - DN_MAX_NSP_DATA_HEADER; - if (dev) { - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - mtu -= LL_RESERVED_SPACE(dev); - if (dn_db->use_long) - mtu -= 21; - else - mtu -= 6; - mtu -= DN_MAX_NSP_DATA_HEADER; - } else { - /* - * 21 = long header, 16 = guess at MAC header length - */ - mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16); - } - if (mtu > mss) - mss = mtu; - return mss; -} - -static inline unsigned int dn_current_mss(struct sock *sk, int flags) -{ - struct dst_entry *dst = __sk_dst_get(sk); - struct dn_scp *scp = DN_SK(sk); - int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem); - - /* Other data messages are limited to 16 bytes per packet */ - if (flags & MSG_OOB) - return 16; - - /* This works out the maximum size of segment we can send out */ - if (dst) { - u32 mtu = dst_mtu(dst); - mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now); - } - - return mss_now; -} - -/* - * N.B. We get the timeout wrong here, but then we always did get it - * wrong before and this is another step along the road to correcting - * it. It ought to get updated each time we pass through the routine, - * but in practise it probably doesn't matter too much for now. 
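
For clarity, the dn_mss_from_pmtu() arithmetic a little above, restated as a standalone sketch: start from the spec-mandated 230-byte floor, then subtract link-layer headroom, the long or short DECnet routing header, and the NSP data header from the device MTU, keeping whichever is larger. LL_RESERVED_SPACE() and DN_MAX_NSP_DATA_HEADER are replaced by caller-supplied parameters, and the example MTU and header sizes in main() are assumptions.

#include <stdio.h>

static unsigned int mss_from_pmtu(int mtu, int ll_reserved,
				  int nsp_hdr, int use_long)
{
	unsigned int mss = 230 - nsp_hdr;	/* spec-mandated floor */

	mtu -= ll_reserved;
	mtu -= use_long ? 21 : 6;		/* long vs. short routing header */
	mtu -= nsp_hdr;
	if ((unsigned int)mtu > mss)
		mss = mtu;
	return mss;
}

int main(void)
{
	/* e.g. Ethernet-ish: MTU 1500, 16 bytes of link headroom and an
	 * 11-byte NSP data header -- all assumed numbers */
	printf("mss = %u\n", mss_from_pmtu(1500, 16, 11, 1));
	return 0;
}
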
- */ -static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk, - unsigned long datalen, int noblock, - int *errcode) -{ - struct sk_buff *skb = sock_alloc_send_skb(sk, datalen, - noblock, errcode); - if (skb) { - skb->protocol = htons(ETH_P_DNA_RT); - skb->pkt_type = PACKET_OUTGOING; - } - return skb; -} - -static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) -{ - struct sock *sk = sock->sk; - struct dn_scp *scp = DN_SK(sk); - size_t mss; - struct sk_buff_head *queue = &scp->data_xmit_queue; - int flags = msg->msg_flags; - int err = 0; - size_t sent = 0; - int addr_len = msg->msg_namelen; - DECLARE_SOCKADDR(struct sockaddr_dn *, addr, msg->msg_name); - struct sk_buff *skb = NULL; - struct dn_skb_cb *cb; - size_t len; - unsigned char fctype; - long timeo; - - if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT)) - return -EOPNOTSUPP; - - if (addr_len && (addr_len != sizeof(struct sockaddr_dn))) - return -EINVAL; - - lock_sock(sk); - timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - /* - * The only difference between stream sockets and sequenced packet - * sockets is that the stream sockets always behave as if MSG_EOR - * has been set. - */ - if (sock->type == SOCK_STREAM) { - if (flags & MSG_EOR) { - err = -EINVAL; - goto out; - } - flags |= MSG_EOR; - } - - - err = dn_check_state(sk, addr, addr_len, &timeo, flags); - if (err) - goto out_err; - - if (sk->sk_shutdown & SEND_SHUTDOWN) { - err = -EPIPE; - if (!(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - goto out_err; - } - - if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) - dst_negative_advice(sk); - - mss = scp->segsize_rem; - fctype = scp->services_rem & NSP_FC_MASK; - - mss = dn_current_mss(sk, flags); - - if (flags & MSG_OOB) { - queue = &scp->other_xmit_queue; - if (size > mss) { - err = -EMSGSIZE; - goto out; - } - } - - scp->persist_fxn = dn_nsp_xmit_timeout; - - while(sent < size) { - err = sock_error(sk); - if (err) - goto out; - - if (signal_pending(current)) { - err = sock_intr_errno(timeo); - goto out; - } - - /* - * Calculate size that we wish to send. - */ - len = size - sent; - - if (len > mss) - len = mss; - - /* - * Wait for queue size to go down below the window - * size. - */ - if (dn_queue_too_long(scp, queue, flags)) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - if (flags & MSG_DONTWAIT) { - err = -EWOULDBLOCK; - goto out; - } - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, - !dn_queue_too_long(scp, queue, flags), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - continue; - } - - /* - * Get a suitably sized skb. - * 64 is a bit of a hack really, but its larger than any - * link-layer headers and has served us well as a good - * guess as to their real length. 
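
The allocate-with-headroom pattern this comment refers to, restated as a hedged toy sketch: over-allocate, reserve the guessed header space up front, then append payload, so that the NSP and link layers can later prepend their headers into the reserved space without copying. The toybuf type merely stands in for struct sk_buff, and all sizes are illustrative.

#include <stdio.h>
#include <string.h>

struct toybuf {
	unsigned char data[256];
	size_t head;		/* start of payload, like skb->data */
	size_t len;
};

static void toy_reserve(struct toybuf *b, size_t headroom)
{
	b->head += headroom;	/* leave room for headers to be prepended */
}

static unsigned char *toy_put(struct toybuf *b, size_t n)
{
	unsigned char *p = b->data + b->head + b->len;

	b->len += n;
	return p;		/* caller fills the newly appended bytes */
}

int main(void)
{
	struct toybuf b = { .head = 0, .len = 0 };
	size_t hdr_guess = 64 + 11;	/* 64-byte link guess + assumed NSP header */

	toy_reserve(&b, hdr_guess);
	memcpy(toy_put(&b, 5), "hello", 5);
	printf("payload at offset %zu, len %zu\n", b.head, b.len);
	return 0;
}
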
- */ - skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER, - flags & MSG_DONTWAIT, &err); - - if (err) - break; - - if (!skb) - continue; - - cb = DN_SKB_CB(skb); - - skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER); - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; - goto out; - } - - if (flags & MSG_OOB) { - cb->nsp_flags = 0x30; - if (fctype != NSP_FC_NONE) - scp->flowrem_oth--; - } else { - cb->nsp_flags = 0x00; - if (scp->seg_total == 0) - cb->nsp_flags |= 0x20; - - scp->seg_total += len; - - if (((sent + len) == size) && (flags & MSG_EOR)) { - cb->nsp_flags |= 0x40; - scp->seg_total = 0; - if (fctype == NSP_FC_SCMC) - scp->flowrem_dat--; - } - if (fctype == NSP_FC_SRC) - scp->flowrem_dat--; - } - - sent += len; - dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB); - skb = NULL; - - scp->persist = dn_nsp_persist(sk); - - } -out: - - kfree_skb(skb); - - release_sock(sk); - - return sent ? sent : err; - -out_err: - err = sk_stream_error(sk, flags, err); - release_sock(sk); - return err; -} - -static int dn_device_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_UP: - dn_dev_up(dev); - break; - case NETDEV_DOWN: - dn_dev_down(dev); - break; - default: - break; - } - - return NOTIFY_DONE; -} - -static struct notifier_block dn_dev_notifier = { - .notifier_call = dn_device_event, -}; - -static struct packet_type dn_dix_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DNA_RT), - .func = dn_route_rcv, -}; - -#ifdef CONFIG_PROC_FS -struct dn_iter_state { - int bucket; -}; - -static struct sock *dn_socket_get_first(struct seq_file *seq) -{ - struct dn_iter_state *state = seq->private; - struct sock *n = NULL; - - for(state->bucket = 0; - state->bucket < DN_SK_HASH_SIZE; - ++state->bucket) { - n = sk_head(&dn_sk_hash[state->bucket]); - if (n) - break; - } - - return n; -} - -static struct sock *dn_socket_get_next(struct seq_file *seq, - struct sock *n) -{ - struct dn_iter_state *state = seq->private; - - n = sk_next(n); - while (!n) { - if (++state->bucket >= DN_SK_HASH_SIZE) - break; - n = sk_head(&dn_sk_hash[state->bucket]); - } - return n; -} - -static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos) -{ - struct sock *sk = dn_socket_get_first(seq); - - if (sk) { - while(*pos && (sk = dn_socket_get_next(seq, sk))) - --*pos; - } - return *pos ? NULL : sk; -} - -static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos) -{ - void *rc; - read_lock_bh(&dn_hash_lock); - rc = socket_get_idx(seq, &pos); - if (!rc) { - read_unlock_bh(&dn_hash_lock); - } - return rc; -} - -static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos) -{ - return *pos ? 
dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; -} - -static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - void *rc; - - if (v == SEQ_START_TOKEN) { - rc = dn_socket_get_idx(seq, 0); - goto out; - } - - rc = dn_socket_get_next(seq, v); - if (rc) - goto out; - read_unlock_bh(&dn_hash_lock); -out: - ++*pos; - return rc; -} - -static void dn_socket_seq_stop(struct seq_file *seq, void *v) -{ - if (v && v != SEQ_START_TOKEN) - read_unlock_bh(&dn_hash_lock); -} - -#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126) - -static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf) -{ - int i; - - switch (le16_to_cpu(dn->sdn_objnamel)) { - case 0: - sprintf(buf, "%d", dn->sdn_objnum); - break; - default: - for (i = 0; i < le16_to_cpu(dn->sdn_objnamel); i++) { - buf[i] = dn->sdn_objname[i]; - if (IS_NOT_PRINTABLE(buf[i])) - buf[i] = '.'; - } - buf[i] = 0; - } -} - -static char *dn_state2asc(unsigned char state) -{ - switch (state) { - case DN_O: - return "OPEN"; - case DN_CR: - return " CR"; - case DN_DR: - return " DR"; - case DN_DRC: - return " DRC"; - case DN_CC: - return " CC"; - case DN_CI: - return " CI"; - case DN_NR: - return " NR"; - case DN_NC: - return " NC"; - case DN_CD: - return " CD"; - case DN_RJ: - return " RJ"; - case DN_RUN: - return " RUN"; - case DN_DI: - return " DI"; - case DN_DIC: - return " DIC"; - case DN_DN: - return " DN"; - case DN_CL: - return " CL"; - case DN_CN: - return " CN"; - } - - return "????"; -} - -static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - char buf1[DN_ASCBUF_LEN]; - char buf2[DN_ASCBUF_LEN]; - char local_object[DN_MAXOBJL+3]; - char remote_object[DN_MAXOBJL+3]; - - dn_printable_object(&scp->addr, local_object); - dn_printable_object(&scp->peer, remote_object); - - seq_printf(seq, - "%6s/%04X %04d:%04d %04d:%04d %01d %-16s " - "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n", - dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->addr)), buf1), - scp->addrloc, - scp->numdat, - scp->numoth, - scp->ackxmt_dat, - scp->ackxmt_oth, - scp->flowloc_sw, - local_object, - dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->peer)), buf2), - scp->addrrem, - scp->numdat_rcv, - scp->numoth_rcv, - scp->ackrcv_dat, - scp->ackrcv_oth, - scp->flowrem_sw, - remote_object, - dn_state2asc(scp->state), - ((scp->accept_mode == ACC_IMMED) ? 
"IMMED" : "DEFER")); -} - -static int dn_socket_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_puts(seq, "Local Remote\n"); - } else { - dn_socket_format_entry(seq, v); - } - return 0; -} - -static const struct seq_operations dn_socket_seq_ops = { - .start = dn_socket_seq_start, - .next = dn_socket_seq_next, - .stop = dn_socket_seq_stop, - .show = dn_socket_seq_show, -}; -#endif - -static const struct net_proto_family dn_family_ops = { - .family = AF_DECnet, - .create = dn_create, - .owner = THIS_MODULE, -}; - -static const struct proto_ops dn_proto_ops = { - .family = AF_DECnet, - .owner = THIS_MODULE, - .release = dn_release, - .bind = dn_bind, - .connect = dn_connect, - .socketpair = sock_no_socketpair, - .accept = dn_accept, - .getname = dn_getname, - .poll = dn_poll, - .ioctl = dn_ioctl, - .listen = dn_listen, - .shutdown = dn_shutdown, - .setsockopt = dn_setsockopt, - .getsockopt = dn_getsockopt, - .sendmsg = dn_sendmsg, - .recvmsg = dn_recvmsg, - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -}; - -MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); -MODULE_AUTHOR("Linux DECnet Project Team"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NETPROTO(PF_DECnet); - -static const char banner[] __initconst = KERN_INFO -"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; - -static int __init decnet_init(void) -{ - int rc; - - printk(banner); - - rc = proto_register(&dn_proto, 1); - if (rc != 0) - goto out; - - dn_neigh_init(); - dn_dev_init(); - dn_route_init(); - dn_fib_init(); - - sock_register(&dn_family_ops); - dev_add_pack(&dn_dix_packet_type); - register_netdevice_notifier(&dn_dev_notifier); - - proc_create_seq_private("decnet", 0444, init_net.proc_net, - &dn_socket_seq_ops, sizeof(struct dn_iter_state), - NULL); - dn_register_sysctl(); -out: - return rc; - -} -module_init(decnet_init); - -/* - * Prevent DECnet module unloading until its fixed properly. - * Requires an audit of the code to check for memory leaks and - * initialisation problems etc. - */ -#if 0 -static void __exit decnet_exit(void) -{ - sock_unregister(AF_DECnet); - rtnl_unregister_all(PF_DECnet); - dev_remove_pack(&dn_dix_packet_type); - - dn_unregister_sysctl(); - - unregister_netdevice_notifier(&dn_dev_notifier); - - dn_route_cleanup(); - dn_dev_cleanup(); - dn_neigh_cleanup(); - dn_fib_cleanup(); - - remove_proc_entry("decnet", init_net.proc_net); - - proto_unregister(&dn_proto); - - rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} -module_exit(decnet_exit); -#endif diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c deleted file mode 100644 index a09ba642b5e7..000000000000 --- a/net/decnet/dn_dev.c +++ /dev/null @@ -1,1433 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Device Layer - * - * Authors: Steve Whitehouse - * Eduardo Marcelo Serrat - * - * Changes: - * Steve Whitehouse : Devices now see incoming frames so they - * can mark on who it came from. - * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour - * can now have a device specific setup func. 
- * Steve Whitehouse : Added /proc/sys/net/decnet/conf// - * Steve Whitehouse : Fixed bug which sometimes killed timer - * Steve Whitehouse : Multiple ifaddr support - * Steve Whitehouse : SIOCGIFCONF is now a compile time option - * Steve Whitehouse : /proc/sys/net/decnet/conf//forwarding - * Steve Whitehouse : Removed timer1 - it's a user space issue now - * Patrick Caulfield : Fixed router hello message format - * Steve Whitehouse : Got rid of constant sizes for blksize for - * devices. All mtu based now. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DN_IFREQ_SIZE (offsetof(struct ifreq, ifr_ifru) + sizeof(struct sockaddr_dn)) - -static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00}; -static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00}; -static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00}; -static unsigned char dn_eco_version[3] = {0x02,0x00,0x00}; - -extern struct neigh_table dn_neigh_table; - -/* - * decnet_address is kept in network order. - */ -__le16 decnet_address = 0; - -static DEFINE_SPINLOCK(dndev_lock); -static struct net_device *decnet_default_device; -static BLOCKING_NOTIFIER_HEAD(dnaddr_chain); - -static struct dn_dev *dn_dev_create(struct net_device *dev, int *err); -static void dn_dev_delete(struct net_device *dev); -static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa); - -static int dn_eth_up(struct net_device *); -static void dn_eth_down(struct net_device *); -static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa); -static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa); - -static struct dn_dev_parms dn_dev_list[] = { -{ - .type = ARPHRD_ETHER, /* Ethernet */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "ethernet", - .up = dn_eth_up, - .down = dn_eth_down, - .timer3 = dn_send_brd_hello, -}, -{ - .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "ipgre", - .timer3 = dn_send_brd_hello, -}, -#if 0 -{ - .type = ARPHRD_X25, /* Bog standard X.25 */ - .mode = DN_DEV_UCAST, - .state = DN_DEV_S_DS, - .t2 = 1, - .t3 = 120, - .name = "x25", - .timer3 = dn_send_ptp_hello, -}, -#endif -#if 0 -{ - .type = ARPHRD_PPP, /* DECnet over PPP */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "ppp", - .timer3 = dn_send_brd_hello, -}, -#endif -{ - .type = ARPHRD_DDCMP, /* DECnet over DDCMP */ - .mode = DN_DEV_UCAST, - .state = DN_DEV_S_DS, - .t2 = 1, - .t3 = 120, - .name = "ddcmp", - .timer3 = dn_send_ptp_hello, -}, -{ - .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */ - .mode = DN_DEV_BCAST, - .state = DN_DEV_S_RU, - .t2 = 1, - .t3 = 10, - .name = "loopback", - .timer3 = dn_send_brd_hello, -} -}; - -#define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list) - -#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x) - -#ifdef CONFIG_SYSCTL - -static int min_t2[] = { 1 }; -static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */ -static int min_t3[] = { 1 }; -static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */ - -static int min_priority[1]; -static int max_priority[] = { 127 }; /* 
From DECnet spec */ - -static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, - loff_t *); -static struct dn_dev_sysctl_table { - struct ctl_table_header *sysctl_header; - struct ctl_table dn_dev_vars[5]; -} dn_dev_sysctl = { - NULL, - { - { - .procname = "forwarding", - .data = (void *)DN_DEV_PARMS_OFFSET(forwarding), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = dn_forwarding_proc, - }, - { - .procname = "priority", - .data = (void *)DN_DEV_PARMS_OFFSET(priority), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_priority, - .extra2 = &max_priority - }, - { - .procname = "t2", - .data = (void *)DN_DEV_PARMS_OFFSET(t2), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_t2, - .extra2 = &max_t2 - }, - { - .procname = "t3", - .data = (void *)DN_DEV_PARMS_OFFSET(t3), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_t3, - .extra2 = &max_t3 - }, - { } - }, -}; - -static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) -{ - struct dn_dev_sysctl_table *t; - int i; - - char path[sizeof("net/decnet/conf/") + IFNAMSIZ]; - - t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL); - if (t == NULL) - return; - - for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) { - long offset = (long)t->dn_dev_vars[i].data; - t->dn_dev_vars[i].data = ((char *)parms) + offset; - } - - snprintf(path, sizeof(path), "net/decnet/conf/%s", - dev? dev->name : parms->name); - - t->dn_dev_vars[0].extra1 = (void *)dev; - - t->sysctl_header = register_net_sysctl(&init_net, path, t->dn_dev_vars); - if (t->sysctl_header == NULL) - kfree(t); - else - parms->sysctl = t; -} - -static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) -{ - if (parms->sysctl) { - struct dn_dev_sysctl_table *t = parms->sysctl; - parms->sysctl = NULL; - unregister_net_sysctl_table(t->sysctl_header); - kfree(t); - } -} - -static int dn_forwarding_proc(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ -#ifdef CONFIG_DECNET_ROUTER - struct net_device *dev = table->extra1; - struct dn_dev *dn_db; - int err; - int tmp, old; - - if (table->extra1 == NULL) - return -EINVAL; - - dn_db = rcu_dereference_raw(dev->dn_ptr); - old = dn_db->parms.forwarding; - - err = proc_dointvec(table, write, buffer, lenp, ppos); - - if ((err >= 0) && write) { - if (dn_db->parms.forwarding < 0) - dn_db->parms.forwarding = 0; - if (dn_db->parms.forwarding > 2) - dn_db->parms.forwarding = 2; - /* - * What an ugly hack this is... its works, just. 
It - * would be nice if sysctl/proc were just that little - * bit more flexible so I don't have to write a special - * routine, or suffer hacks like this - SJW - */ - tmp = dn_db->parms.forwarding; - dn_db->parms.forwarding = old; - if (dn_db->parms.down) - dn_db->parms.down(dev); - dn_db->parms.forwarding = tmp; - if (dn_db->parms.up) - dn_db->parms.up(dev); - } - - return err; -#else - return -EINVAL; -#endif -} - -#else /* CONFIG_SYSCTL */ -static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) -{ -} -static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) -{ -} - -#endif /* CONFIG_SYSCTL */ - -static inline __u16 mtu2blksize(struct net_device *dev) -{ - u32 blksize = dev->mtu; - if (blksize > 0xffff) - blksize = 0xffff; - - if (dev->type == ARPHRD_ETHER || - dev->type == ARPHRD_PPP || - dev->type == ARPHRD_IPGRE || - dev->type == ARPHRD_LOOPBACK) - blksize -= 2; - - return (__u16)blksize; -} - -static struct dn_ifaddr *dn_dev_alloc_ifa(void) -{ - struct dn_ifaddr *ifa; - - ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); - - return ifa; -} - -static void dn_dev_free_ifa(struct dn_ifaddr *ifa) -{ - kfree_rcu(ifa, rcu); -} - -static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy) -{ - struct dn_ifaddr *ifa1 = rtnl_dereference(*ifap); - unsigned char mac_addr[6]; - struct net_device *dev = dn_db->dev; - - ASSERT_RTNL(); - - *ifap = ifa1->ifa_next; - - if (dn_db->dev->type == ARPHRD_ETHER) { - if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) { - dn_dn2eth(mac_addr, ifa1->ifa_local); - dev_mc_del(dev, mac_addr); - } - } - - dn_ifaddr_notify(RTM_DELADDR, ifa1); - blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1); - if (destroy) { - dn_dev_free_ifa(ifa1); - - if (dn_db->ifa_list == NULL) - dn_dev_delete(dn_db->dev); - } -} - -static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa) -{ - struct net_device *dev = dn_db->dev; - struct dn_ifaddr *ifa1; - unsigned char mac_addr[6]; - - ASSERT_RTNL(); - - /* Check for duplicates */ - for (ifa1 = rtnl_dereference(dn_db->ifa_list); - ifa1 != NULL; - ifa1 = rtnl_dereference(ifa1->ifa_next)) { - if (ifa1->ifa_local == ifa->ifa_local) - return -EEXIST; - } - - if (dev->type == ARPHRD_ETHER) { - if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) { - dn_dn2eth(mac_addr, ifa->ifa_local); - dev_mc_add(dev, mac_addr); - } - } - - ifa->ifa_next = dn_db->ifa_list; - rcu_assign_pointer(dn_db->ifa_list, ifa); - - dn_ifaddr_notify(RTM_NEWADDR, ifa); - blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa); - - return 0; -} - -static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa) -{ - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - int rv; - - if (dn_db == NULL) { - int err; - dn_db = dn_dev_create(dev, &err); - if (dn_db == NULL) - return err; - } - - ifa->ifa_dev = dn_db; - - if (dev->flags & IFF_LOOPBACK) - ifa->ifa_scope = RT_SCOPE_HOST; - - rv = dn_dev_insert_ifa(dn_db, ifa); - if (rv) - dn_dev_free_ifa(ifa); - return rv; -} - - -int dn_dev_ioctl(unsigned int cmd, void __user *arg) -{ - char buffer[DN_IFREQ_SIZE]; - struct ifreq *ifr = (struct ifreq *)buffer; - struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr; - struct dn_dev *dn_db; - struct net_device *dev; - struct dn_ifaddr *ifa = NULL; - struct dn_ifaddr __rcu **ifap = NULL; - int ret = 0; - - if (copy_from_user(ifr, arg, DN_IFREQ_SIZE)) - return -EFAULT; - ifr->ifr_name[IFNAMSIZ-1] = 0; - - dev_load(&init_net, ifr->ifr_name); - - switch (cmd) { - case 
SIOCGIFADDR: - break; - case SIOCSIFADDR: - if (!capable(CAP_NET_ADMIN)) - return -EACCES; - if (sdn->sdn_family != AF_DECnet) - return -EINVAL; - break; - default: - return -EINVAL; - } - - rtnl_lock(); - - if ((dev = __dev_get_by_name(&init_net, ifr->ifr_name)) == NULL) { - ret = -ENODEV; - goto done; - } - - if ((dn_db = rtnl_dereference(dev->dn_ptr)) != NULL) { - for (ifap = &dn_db->ifa_list; - (ifa = rtnl_dereference(*ifap)) != NULL; - ifap = &ifa->ifa_next) - if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0) - break; - } - - if (ifa == NULL && cmd != SIOCSIFADDR) { - ret = -EADDRNOTAVAIL; - goto done; - } - - switch (cmd) { - case SIOCGIFADDR: - *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local; - if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) - ret = -EFAULT; - break; - - case SIOCSIFADDR: - if (!ifa) { - if ((ifa = dn_dev_alloc_ifa()) == NULL) { - ret = -ENOBUFS; - break; - } - memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); - } else { - if (ifa->ifa_local == dn_saddr2dn(sdn)) - break; - dn_dev_del_ifa(dn_db, ifap, 0); - } - - ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn); - - ret = dn_dev_set_ifa(dev, ifa); - } -done: - rtnl_unlock(); - - return ret; -} - -struct net_device *dn_dev_get_default(void) -{ - struct net_device *dev; - - spin_lock(&dndev_lock); - dev = decnet_default_device; - if (dev) { - if (dev->dn_ptr) - dev_hold(dev); - else - dev = NULL; - } - spin_unlock(&dndev_lock); - - return dev; -} - -int dn_dev_set_default(struct net_device *dev, int force) -{ - struct net_device *old = NULL; - int rv = -EBUSY; - if (!dev->dn_ptr) - return -ENODEV; - - spin_lock(&dndev_lock); - if (force || decnet_default_device == NULL) { - old = decnet_default_device; - decnet_default_device = dev; - rv = 0; - } - spin_unlock(&dndev_lock); - - dev_put(old); - return rv; -} - -static void dn_dev_check_default(struct net_device *dev) -{ - spin_lock(&dndev_lock); - if (dev == decnet_default_device) { - decnet_default_device = NULL; - } else { - dev = NULL; - } - spin_unlock(&dndev_lock); - - dev_put(dev); -} - -/* - * Called with RTNL - */ -static struct dn_dev *dn_dev_by_index(int ifindex) -{ - struct net_device *dev; - struct dn_dev *dn_dev = NULL; - - dev = __dev_get_by_index(&init_net, ifindex); - if (dev) - dn_dev = rtnl_dereference(dev->dn_ptr); - - return dn_dev; -} - -static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { - [IFA_ADDRESS] = { .type = NLA_U16 }, - [IFA_LOCAL] = { .type = NLA_U16 }, - [IFA_LABEL] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, - [IFA_FLAGS] = { .type = NLA_U32 }, -}; - -static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; - struct dn_dev *dn_db; - struct ifaddrmsg *ifm; - struct dn_ifaddr *ifa; - struct dn_ifaddr __rcu **ifap; - int err = -EINVAL; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - goto errout; - - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, - dn_ifa_policy, extack); - if (err < 0) - goto errout; - - err = -ENODEV; - ifm = nlmsg_data(nlh); - if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL) - goto errout; - - err = -EADDRNOTAVAIL; - for (ifap = &dn_db->ifa_list; - (ifa = rtnl_dereference(*ifap)) != NULL; - ifap = &ifa->ifa_next) { - if (tb[IFA_LOCAL] && - nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2)) - continue; - - if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) - continue; - - dn_dev_del_ifa(dn_db, ifap, 1); - return 0; - } - 
-errout: - return err; -} - -static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; - struct net_device *dev; - struct dn_dev *dn_db; - struct ifaddrmsg *ifm; - struct dn_ifaddr *ifa; - int err; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, - dn_ifa_policy, extack); - if (err < 0) - return err; - - if (tb[IFA_LOCAL] == NULL) - return -EINVAL; - - ifm = nlmsg_data(nlh); - if ((dev = __dev_get_by_index(&init_net, ifm->ifa_index)) == NULL) - return -ENODEV; - - if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) { - dn_db = dn_dev_create(dev, &err); - if (!dn_db) - return err; - } - - if ((ifa = dn_dev_alloc_ifa()) == NULL) - return -ENOBUFS; - - if (tb[IFA_ADDRESS] == NULL) - tb[IFA_ADDRESS] = tb[IFA_LOCAL]; - - ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]); - ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]); - ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : - ifm->ifa_flags; - ifa->ifa_scope = ifm->ifa_scope; - ifa->ifa_dev = dn_db; - - if (tb[IFA_LABEL]) - nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); - else - memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); - - err = dn_dev_insert_ifa(dn_db, ifa); - if (err) - dn_dev_free_ifa(ifa); - - return err; -} - -static inline size_t dn_ifaddr_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) - + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ - + nla_total_size(2) /* IFA_ADDRESS */ - + nla_total_size(2) /* IFA_LOCAL */ - + nla_total_size(4); /* IFA_FLAGS */ -} - -static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, - u32 portid, u32 seq, int event, unsigned int flags) -{ - struct ifaddrmsg *ifm; - struct nlmsghdr *nlh; - u32 ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT; - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); - if (nlh == NULL) - return -EMSGSIZE; - - ifm = nlmsg_data(nlh); - ifm->ifa_family = AF_DECnet; - ifm->ifa_prefixlen = 16; - ifm->ifa_flags = ifa_flags; - ifm->ifa_scope = ifa->ifa_scope; - ifm->ifa_index = ifa->ifa_dev->dev->ifindex; - - if ((ifa->ifa_address && - nla_put_le16(skb, IFA_ADDRESS, ifa->ifa_address)) || - (ifa->ifa_local && - nla_put_le16(skb, IFA_LOCAL, ifa->ifa_local)) || - (ifa->ifa_label[0] && - nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || - nla_put_u32(skb, IFA_FLAGS, ifa_flags)) - goto nla_put_failure; - nlmsg_end(skb, nlh); - return 0; - -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) -{ - struct sk_buff *skb; - int err = -ENOBUFS; - - skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL); - if (skb == NULL) - goto errout; - - err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in dn_ifaddr_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); - return; -errout: - if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); -} - -static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - int idx, dn_idx = 0, skip_ndevs, skip_naddr; - struct net_device *dev; - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - - if (!net_eq(net, &init_net)) - return 0; - - skip_ndevs = cb->args[0]; - skip_naddr = 
cb->args[1]; - - idx = 0; - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (idx < skip_ndevs) - goto cont; - else if (idx > skip_ndevs) { - /* Only skip over addresses for first dev dumped - * in this iteration (idx == skip_ndevs) */ - skip_naddr = 0; - } - - if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL) - goto cont; - - for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa; - ifa = rcu_dereference(ifa->ifa_next), dn_idx++) { - if (dn_idx < skip_naddr) - continue; - - if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWADDR, - NLM_F_MULTI) < 0) - goto done; - } -cont: - idx++; - } -done: - rcu_read_unlock(); - cb->args[0] = idx; - cb->args[1] = dn_idx; - - return skb->len; -} - -static int dn_dev_get_first(struct net_device *dev, __le16 *addr) -{ - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int rv = -ENODEV; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) - goto out; - - ifa = rcu_dereference(dn_db->ifa_list); - if (ifa != NULL) { - *addr = ifa->ifa_local; - rv = 0; - } -out: - rcu_read_unlock(); - return rv; -} - -/* - * Find a default address to bind to. - * - * This is one of those areas where the initial VMS concepts don't really - * map onto the Linux concepts, and since we introduced multiple addresses - * per interface we have to cope with slightly odd ways of finding out what - * "our address" really is. Mostly it's not a problem; for this we just guess - * a sensible default. Eventually the routing code will take care of all the - * nasties for us I hope. - */ -int dn_dev_bind_default(__le16 *addr) -{ - struct net_device *dev; - int rv; - dev = dn_dev_get_default(); -last_chance: - if (dev) { - rv = dn_dev_get_first(dev, addr); - dev_put(dev); - if (rv == 0 || dev == init_net.loopback_dev) - return rv; - } - dev = init_net.loopback_dev; - dev_hold(dev); - goto last_chance; -} - -static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - struct endnode_hello_message *msg; - struct sk_buff *skb = NULL; - __le16 *pktlen; - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL) - return; - - skb->dev = dev; - - msg = skb_put(skb, sizeof(*msg)); - - msg->msgflg = 0x0D; - memcpy(msg->tiver, dn_eco_version, 3); - dn_dn2eth(msg->id, ifa->ifa_local); - msg->iinfo = DN_RT_INFO_ENDN; - msg->blksize = cpu_to_le16(mtu2blksize(dev)); - msg->area = 0x00; - memset(msg->seed, 0, 8); - memcpy(msg->neighbor, dn_hiord, ETH_ALEN); - - if (dn_db->router) { - struct dn_neigh *dn = container_of(dn_db->router, struct dn_neigh, n); - dn_dn2eth(msg->neighbor, dn->addr); - } - - msg->timer = cpu_to_le16((unsigned short)dn_db->parms.t3); - msg->mpd = 0x00; - msg->datalen = 0x02; - memset(msg->data, 0xAA, 2); - - pktlen = skb_push(skb, 2); - *pktlen = cpu_to_le16(skb->len - 2); - - skb_reset_network_header(skb); - - dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id); -} - - -#define DRDELAY (5 * HZ) - -static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa) -{ - /* First check time since device went up */ - if (time_before(jiffies, dn_db->uptime + DRDELAY)) - return 0; - - /* If there is no router, then yes... */ - if (!dn_db->router) - return 1; - - /* otherwise only if we have a higher priority or.. 
*/ - if (dn->priority < dn_db->parms.priority) - return 1; - - /* if we have equal priority and a higher node number */ - if (dn->priority != dn_db->parms.priority) - return 0; - - if (le16_to_cpu(dn->addr) < le16_to_cpu(ifa->ifa_local)) - return 1; - - return 0; -} - -static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - int n; - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - struct dn_neigh *dn = container_of(dn_db->router, struct dn_neigh, n); - struct sk_buff *skb; - size_t size; - unsigned char *ptr; - unsigned char *i1, *i2; - __le16 *pktlen; - char *src; - - if (mtu2blksize(dev) < (26 + 7)) - return; - - n = mtu2blksize(dev) - 26; - n /= 7; - - if (n > 32) - n = 32; - - size = 2 + 26 + 7 * n; - - if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL) - return; - - skb->dev = dev; - ptr = skb_put(skb, size); - - *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH; - *ptr++ = 2; /* ECO */ - *ptr++ = 0; - *ptr++ = 0; - dn_dn2eth(ptr, ifa->ifa_local); - src = ptr; - ptr += ETH_ALEN; - *ptr++ = dn_db->parms.forwarding == 1 ? - DN_RT_INFO_L1RT : DN_RT_INFO_L2RT; - *((__le16 *)ptr) = cpu_to_le16(mtu2blksize(dev)); - ptr += 2; - *ptr++ = dn_db->parms.priority; /* Priority */ - *ptr++ = 0; /* Area: Reserved */ - *((__le16 *)ptr) = cpu_to_le16((unsigned short)dn_db->parms.t3); - ptr += 2; - *ptr++ = 0; /* MPD: Reserved */ - i1 = ptr++; - memset(ptr, 0, 7); /* Name: Reserved */ - ptr += 7; - i2 = ptr++; - - n = dn_neigh_elist(dev, ptr, n); - - *i2 = 7 * n; - *i1 = 8 + *i2; - - skb_trim(skb, (27 + *i2)); - - pktlen = skb_push(skb, 2); - *pktlen = cpu_to_le16(skb->len - 2); - - skb_reset_network_header(skb); - - if (dn_am_i_a_router(dn, dn_db, ifa)) { - struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); - if (skb2) { - dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src); - } - } - - dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); -} - -static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.forwarding == 0) - dn_send_endnode_hello(dev, ifa); - else - dn_send_router_hello(dev, ifa); -} - -static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa) -{ - int tdlen = 16; - int size = dev->hard_header_len + 2 + 4 + tdlen; - struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC); - int i; - unsigned char *ptr; - char src[ETH_ALEN]; - - if (skb == NULL) - return ; - - skb->dev = dev; - skb_push(skb, dev->hard_header_len); - ptr = skb_put(skb, 2 + 4 + tdlen); - - *ptr++ = DN_RT_PKT_HELO; - *((__le16 *)ptr) = ifa->ifa_local; - ptr += 2; - *ptr++ = tdlen; - - for(i = 0; i < tdlen; i++) - *ptr++ = 0252; - - dn_dn2eth(src, ifa->ifa_local); - dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); -} - -static int dn_eth_up(struct net_device *dev) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.forwarding == 0) - dev_mc_add(dev, dn_rt_all_end_mcast); - else - dev_mc_add(dev, dn_rt_all_rt_mcast); - - dn_db->use_long = 1; - - return 0; -} - -static void dn_eth_down(struct net_device *dev) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.forwarding == 0) - dev_mc_del(dev, dn_rt_all_end_mcast); - else - dev_mc_del(dev, dn_rt_all_rt_mcast); -} - -static void dn_dev_set_timer(struct net_device *dev); - -static void dn_dev_timer_func(struct timer_list *t) -{ - struct dn_dev *dn_db = from_timer(dn_db, t, timer); - struct net_device *dev; - struct dn_ifaddr *ifa; - - rcu_read_lock(); - dev = 
dn_db->dev; - if (dn_db->t3 <= dn_db->parms.t2) { - if (dn_db->parms.timer3) { - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa; - ifa = rcu_dereference(ifa->ifa_next)) { - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - dn_db->parms.timer3(dev, ifa); - } - } - dn_db->t3 = dn_db->parms.t3; - } else { - dn_db->t3 -= dn_db->parms.t2; - } - rcu_read_unlock(); - dn_dev_set_timer(dev); -} - -static void dn_dev_set_timer(struct net_device *dev) -{ - struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - - if (dn_db->parms.t2 > dn_db->parms.t3) - dn_db->parms.t2 = dn_db->parms.t3; - - dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); - - add_timer(&dn_db->timer); -} - -static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) -{ - int i; - struct dn_dev_parms *p = dn_dev_list; - struct dn_dev *dn_db; - - for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) { - if (p->type == dev->type) - break; - } - - *err = -ENODEV; - if (i == DN_DEV_LIST_SIZE) - return NULL; - - *err = -ENOBUFS; - if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL) - return NULL; - - memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms)); - - rcu_assign_pointer(dev->dn_ptr, dn_db); - dn_db->dev = dev; - timer_setup(&dn_db->timer, dn_dev_timer_func, 0); - - dn_db->uptime = jiffies; - - dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table); - if (!dn_db->neigh_parms) { - RCU_INIT_POINTER(dev->dn_ptr, NULL); - kfree(dn_db); - return NULL; - } - - if (dn_db->parms.up) { - if (dn_db->parms.up(dev) < 0) { - neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); - dev->dn_ptr = NULL; - kfree(dn_db); - return NULL; - } - } - - dn_dev_sysctl_register(dev, &dn_db->parms); - - dn_dev_set_timer(dev); - - *err = 0; - return dn_db; -} - - -/* - * This processes a device up event. We only start up - * the loopback device & ethernet devices with correct - * MAC addresses automatically. Others must be started - * specifically. - * - * FIXME: How should we configure the loopback address ? If we could dispense - * with using decnet_address here and for autobind, it will be one less thing - * for users to worry about setting up. - */ - -void dn_dev_up(struct net_device *dev) -{ - struct dn_ifaddr *ifa; - __le16 addr = decnet_address; - int maybe_default = 0; - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) - return; - - /* - * Need to ensure that loopback device has a dn_db attached to it - * to allow creation of neighbours against it, even though it might - * not have a local address of its own. Might as well do the same for - * all autoconfigured interfaces. - */ - if (dn_db == NULL) { - int err; - dn_db = dn_dev_create(dev, &err); - if (dn_db == NULL) - return; - } - - if (dev->type == ARPHRD_ETHER) { - if (memcmp(dev->dev_addr, dn_hiord, 4) != 0) - return; - addr = dn_eth2dn(dev->dev_addr); - maybe_default = 1; - } - - if (addr == 0) - return; - - if ((ifa = dn_dev_alloc_ifa()) == NULL) - return; - - ifa->ifa_local = ifa->ifa_address = addr; - ifa->ifa_flags = 0; - ifa->ifa_scope = RT_SCOPE_UNIVERSE; - strcpy(ifa->ifa_label, dev->name); - - dn_dev_set_ifa(dev, ifa); - - /* - * Automagically set the default device to the first automatically - * configured ethernet card in the system. 
- */ - if (maybe_default) { - dev_hold(dev); - if (dn_dev_set_default(dev, 0)) - dev_put(dev); - } -} - -static void dn_dev_delete(struct net_device *dev) -{ - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - - if (dn_db == NULL) - return; - - del_timer_sync(&dn_db->timer); - dn_dev_sysctl_unregister(&dn_db->parms); - dn_dev_check_default(dev); - neigh_ifdown(&dn_neigh_table, dev); - - if (dn_db->parms.down) - dn_db->parms.down(dev); - - dev->dn_ptr = NULL; - - neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); - neigh_ifdown(&dn_neigh_table, dev); - - if (dn_db->router) - neigh_release(dn_db->router); - if (dn_db->peer) - neigh_release(dn_db->peer); - - kfree(dn_db); -} - -void dn_dev_down(struct net_device *dev) -{ - struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); - struct dn_ifaddr *ifa; - - if (dn_db == NULL) - return; - - while ((ifa = rtnl_dereference(dn_db->ifa_list)) != NULL) { - dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0); - dn_dev_free_ifa(ifa); - } - - dn_dev_delete(dev); -} - -void dn_dev_init_pkt(struct sk_buff *skb) -{ -} - -void dn_dev_veri_pkt(struct sk_buff *skb) -{ -} - -void dn_dev_hello(struct sk_buff *skb) -{ -} - -void dn_dev_devices_off(void) -{ - struct net_device *dev; - - rtnl_lock(); - for_each_netdev(&init_net, dev) - dn_dev_down(dev); - rtnl_unlock(); - -} - -void dn_dev_devices_on(void) -{ - struct net_device *dev; - - rtnl_lock(); - for_each_netdev(&init_net, dev) { - if (dev->flags & IFF_UP) - dn_dev_up(dev); - } - rtnl_unlock(); -} - -int register_dnaddr_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&dnaddr_chain, nb); -} - -int unregister_dnaddr_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&dnaddr_chain, nb); -} - -#ifdef CONFIG_PROC_FS -static inline int is_dn_dev(struct net_device *dev) -{ - return dev->dn_ptr != NULL; -} - -static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - int i; - struct net_device *dev; - - rcu_read_lock(); - - if (*pos == 0) - return SEQ_START_TOKEN; - - i = 1; - for_each_netdev_rcu(&init_net, dev) { - if (!is_dn_dev(dev)) - continue; - - if (i++ == *pos) - return dev; - } - - return NULL; -} - -static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct net_device *dev; - - ++*pos; - - dev = v; - if (v == SEQ_START_TOKEN) - dev = net_device_entry(&init_net.dev_base_head); - - for_each_netdev_continue_rcu(&init_net, dev) { - if (!is_dn_dev(dev)) - continue; - - return dev; - } - - return NULL; -} - -static void dn_dev_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static char *dn_type2asc(char type) -{ - switch (type) { - case DN_DEV_BCAST: - return "B"; - case DN_DEV_UCAST: - return "U"; - case DN_DEV_MPOINT: - return "M"; - } - - return "?"; -} - -static int dn_dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n"); - else { - struct net_device *dev = v; - char peer_buf[DN_ASCBUF_LEN]; - char router_buf[DN_ASCBUF_LEN]; - struct dn_dev *dn_db = rcu_dereference(dev->dn_ptr); - - seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" - " %04hu %03d %02x %-10s %-7s %-7s\n", - dev->name, - dn_type2asc(dn_db->parms.mode), - 0, 0, - dn_db->t3, dn_db->parms.t3, - mtu2blksize(dev), - dn_db->parms.priority, - dn_db->parms.state, dn_db->parms.name, - dn_db->router ? 
dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->router->primary_key), router_buf) : "", - dn_db->peer ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->peer->primary_key), peer_buf) : ""); - } - return 0; -} - -static const struct seq_operations dn_dev_seq_ops = { - .start = dn_dev_seq_start, - .next = dn_dev_seq_next, - .stop = dn_dev_seq_stop, - .show = dn_dev_seq_show, -}; -#endif /* CONFIG_PROC_FS */ - -static int addr[2]; -module_param_array(addr, int, NULL, 0444); -MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node"); - -void __init dn_dev_init(void) -{ - if (addr[0] > 63 || addr[0] < 0) { - printk(KERN_ERR "DECnet: Area must be between 0 and 63"); - return; - } - - if (addr[1] > 1023 || addr[1] < 0) { - printk(KERN_ERR "DECnet: Node must be between 0 and 1023"); - return; - } - - decnet_address = cpu_to_le16((addr[0] << 10) | addr[1]); - - dn_dev_devices_on(); - - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR, - dn_nl_newaddr, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR, - dn_nl_deladdr, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR, - NULL, dn_nl_dump_ifaddr, 0); - - proc_create_seq("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_ops); - -#ifdef CONFIG_SYSCTL - { - int i; - for(i = 0; i < DN_DEV_LIST_SIZE; i++) - dn_dev_sysctl_register(NULL, &dn_dev_list[i]); - } -#endif /* CONFIG_SYSCTL */ -} - -void __exit dn_dev_cleanup(void) -{ -#ifdef CONFIG_SYSCTL - { - int i; - for(i = 0; i < DN_DEV_LIST_SIZE; i++) - dn_dev_sysctl_unregister(&dn_dev_list[i]); - } -#endif /* CONFIG_SYSCTL */ - - remove_proc_entry("decnet_dev", init_net.proc_net); - - dn_dev_devices_off(); -} diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c deleted file mode 100644 index 269c029ad74f..000000000000 --- a/net/decnet/dn_fib.c +++ /dev/null @@ -1,798 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Forwarding Information Base (Glue/Info List) - * - * Author: Steve Whitehouse - * - * - * Changes: - * Alexey Kuznetsov : SMP locking changes - * Steve Whitehouse : Rewrote it... Well to be more correct, I - * copied most of it from the ipv4 fib code. - * Steve Whitehouse : Updated it in style and fixed a few bugs - * which were fixed in the ipv4 code since - * this code was copied from it. 
- * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define RT_MIN_TABLE 1 - -#define for_fib_info() { struct dn_fib_info *fi;\ - for(fi = dn_fib_info_list; fi; fi = fi->fib_next) -#define endfor_fib_info() } - -#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ - for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\ - for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define endfor_nexthops(fi) } - -static DEFINE_SPINLOCK(dn_fib_multipath_lock); -static struct dn_fib_info *dn_fib_info_list; -static DEFINE_SPINLOCK(dn_fib_info_lock); - -static struct -{ - int error; - u8 scope; -} dn_fib_props[RTN_MAX+1] = { - [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, - [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE }, - [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST }, - [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, - [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, - [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, - [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE }, - [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE }, - [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE }, - [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE }, - [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, - [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, -}; - -static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force); -static int dn_fib_sync_up(struct net_device *dev); - -void dn_fib_free_info(struct dn_fib_info *fi) -{ - if (fi->fib_dead == 0) { - printk(KERN_DEBUG "DECnet: BUG! 
Attempt to free alive dn_fib_info\n"); - return; - } - - change_nexthops(fi) { - dev_put(nh->nh_dev); - nh->nh_dev = NULL; - } endfor_nexthops(fi); - kfree(fi); -} - -void dn_fib_release_info(struct dn_fib_info *fi) -{ - spin_lock(&dn_fib_info_lock); - if (fi && refcount_dec_and_test(&fi->fib_treeref)) { - if (fi->fib_next) - fi->fib_next->fib_prev = fi->fib_prev; - if (fi->fib_prev) - fi->fib_prev->fib_next = fi->fib_next; - if (fi == dn_fib_info_list) - dn_fib_info_list = fi->fib_next; - fi->fib_dead = 1; - dn_fib_info_put(fi); - } - spin_unlock(&dn_fib_info_lock); -} - -static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi) -{ - const struct dn_fib_nh *onh = ofi->fib_nh; - - for_nexthops(fi) { - if (nh->nh_oif != onh->nh_oif || - nh->nh_gw != onh->nh_gw || - nh->nh_scope != onh->nh_scope || - nh->nh_weight != onh->nh_weight || - ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) - return -1; - onh++; - } endfor_nexthops(fi); - return 0; -} - -static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi) -{ - for_fib_info() { - if (fi->fib_nhs != nfi->fib_nhs) - continue; - if (nfi->fib_protocol == fi->fib_protocol && - nfi->fib_prefsrc == fi->fib_prefsrc && - nfi->fib_priority == fi->fib_priority && - memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 && - ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && - (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0)) - return fi; - } endfor_fib_info(); - return NULL; -} - -static int dn_fib_count_nhs(const struct nlattr *attr) -{ - struct rtnexthop *nhp = nla_data(attr); - int nhs = 0, nhlen = nla_len(attr); - - while (rtnh_ok(nhp, nhlen)) { - nhs++; - nhp = rtnh_next(nhp, &nhlen); - } - - /* leftover implies invalid nexthop configuration, discard it */ - return nhlen > 0 ? 0 : nhs; -} - -static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr, - const struct rtmsg *r) -{ - struct rtnexthop *nhp = nla_data(attr); - int nhlen = nla_len(attr); - - change_nexthops(fi) { - int attrlen; - - if (!rtnh_ok(nhp, nhlen)) - return -EINVAL; - - nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; - nh->nh_oif = nhp->rtnh_ifindex; - nh->nh_weight = nhp->rtnh_hops + 1; - - attrlen = rtnh_attrlen(nhp); - if (attrlen > 0) { - struct nlattr *gw_attr; - - gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); - nh->nh_gw = gw_attr ? 
nla_get_le16(gw_attr) : 0; - } - - nhp = rtnh_next(nhp, &nhlen); - } endfor_nexthops(fi); - - return 0; -} - - -static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh) -{ - int err; - - if (nh->nh_gw) { - struct flowidn fld; - struct dn_fib_res res; - - if (nh->nh_flags&RTNH_F_ONLINK) { - struct net_device *dev; - - if (r->rtm_scope >= RT_SCOPE_LINK) - return -EINVAL; - if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST) - return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) - return -ENODEV; - if (!(dev->flags&IFF_UP)) - return -ENETDOWN; - nh->nh_dev = dev; - dev_hold(dev); - nh->nh_scope = RT_SCOPE_LINK; - return 0; - } - - memset(&fld, 0, sizeof(fld)); - fld.daddr = nh->nh_gw; - fld.flowidn_oif = nh->nh_oif; - fld.flowidn_scope = r->rtm_scope + 1; - - if (fld.flowidn_scope < RT_SCOPE_LINK) - fld.flowidn_scope = RT_SCOPE_LINK; - - if ((err = dn_fib_lookup(&fld, &res)) != 0) - return err; - - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) - goto out; - nh->nh_scope = res.scope; - nh->nh_oif = DN_FIB_RES_OIF(res); - nh->nh_dev = DN_FIB_RES_DEV(res); - if (nh->nh_dev == NULL) - goto out; - dev_hold(nh->nh_dev); - err = -ENETDOWN; - if (!(nh->nh_dev->flags & IFF_UP)) - goto out; - err = 0; -out: - dn_fib_res_put(&res); - return err; - } else { - struct net_device *dev; - - if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) - return -EINVAL; - - dev = __dev_get_by_index(&init_net, nh->nh_oif); - if (dev == NULL || dev->dn_ptr == NULL) - return -ENODEV; - if (!(dev->flags&IFF_UP)) - return -ENETDOWN; - nh->nh_dev = dev; - dev_hold(nh->nh_dev); - nh->nh_scope = RT_SCOPE_HOST; - } - - return 0; -} - - -struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *attrs[], - const struct nlmsghdr *nlh, int *errp) -{ - int err; - struct dn_fib_info *fi = NULL; - struct dn_fib_info *ofi; - int nhs = 1; - - if (r->rtm_type > RTN_MAX) - goto err_inval; - - if (dn_fib_props[r->rtm_type].scope > r->rtm_scope) - goto err_inval; - - if (attrs[RTA_MULTIPATH] && - (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0) - goto err_inval; - - fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - err = -ENOBUFS; - if (fi == NULL) - goto failure; - - fi->fib_protocol = r->rtm_protocol; - fi->fib_nhs = nhs; - fi->fib_flags = r->rtm_flags; - - if (attrs[RTA_PRIORITY]) - fi->fib_priority = nla_get_u32(attrs[RTA_PRIORITY]); - - if (attrs[RTA_METRICS]) { - struct nlattr *attr; - int rem; - - nla_for_each_nested(attr, attrs[RTA_METRICS], rem) { - int type = nla_type(attr); - - if (type) { - if (type > RTAX_MAX || type == RTAX_CC_ALGO || - nla_len(attr) < 4) - goto err_inval; - - fi->fib_metrics[type-1] = nla_get_u32(attr); - } - } - } - - if (attrs[RTA_PREFSRC]) - fi->fib_prefsrc = nla_get_le16(attrs[RTA_PREFSRC]); - - if (attrs[RTA_MULTIPATH]) { - if ((err = dn_fib_get_nhs(fi, attrs[RTA_MULTIPATH], r)) != 0) - goto failure; - - if (attrs[RTA_OIF] && - fi->fib_nh->nh_oif != nla_get_u32(attrs[RTA_OIF])) - goto err_inval; - - if (attrs[RTA_GATEWAY] && - fi->fib_nh->nh_gw != nla_get_le16(attrs[RTA_GATEWAY])) - goto err_inval; - } else { - struct dn_fib_nh *nh = fi->fib_nh; - - if (attrs[RTA_OIF]) - nh->nh_oif = nla_get_u32(attrs[RTA_OIF]); - - if (attrs[RTA_GATEWAY]) - nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]); - - nh->nh_flags = r->rtm_flags; - nh->nh_weight = 1; - } - - if (r->rtm_type == RTN_NAT) { - if (!attrs[RTA_GATEWAY] || nhs != 1 || attrs[RTA_OIF]) - goto err_inval; - - fi->fib_nh->nh_gw = 
nla_get_le16(attrs[RTA_GATEWAY]); - goto link_it; - } - - if (dn_fib_props[r->rtm_type].error) { - if (attrs[RTA_GATEWAY] || attrs[RTA_OIF] || attrs[RTA_MULTIPATH]) - goto err_inval; - - goto link_it; - } - - if (r->rtm_scope > RT_SCOPE_HOST) - goto err_inval; - - if (r->rtm_scope == RT_SCOPE_HOST) { - struct dn_fib_nh *nh = fi->fib_nh; - - /* Local address is added */ - if (nhs != 1 || nh->nh_gw) - goto err_inval; - nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); - err = -ENODEV; - if (nh->nh_dev == NULL) - goto failure; - } else { - change_nexthops(fi) { - if ((err = dn_fib_check_nh(r, fi, nh)) != 0) - goto failure; - } endfor_nexthops(fi) - } - - if (fi->fib_prefsrc) { - if (r->rtm_type != RTN_LOCAL || !attrs[RTA_DST] || - fi->fib_prefsrc != nla_get_le16(attrs[RTA_DST])) - if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } - -link_it: - if ((ofi = dn_fib_find_info(fi)) != NULL) { - fi->fib_dead = 1; - dn_fib_free_info(fi); - refcount_inc(&ofi->fib_treeref); - return ofi; - } - - refcount_set(&fi->fib_treeref, 1); - refcount_set(&fi->fib_clntref, 1); - spin_lock(&dn_fib_info_lock); - fi->fib_next = dn_fib_info_list; - fi->fib_prev = NULL; - if (dn_fib_info_list) - dn_fib_info_list->fib_prev = fi; - dn_fib_info_list = fi; - spin_unlock(&dn_fib_info_lock); - return fi; - -err_inval: - err = -EINVAL; - -failure: - *errp = err; - if (fi) { - fi->fib_dead = 1; - dn_fib_free_info(fi); - } - - return NULL; -} - -int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res) -{ - int err = dn_fib_props[type].error; - - if (err == 0) { - if (fi->fib_flags & RTNH_F_DEAD) - return 1; - - res->fi = fi; - - switch (type) { - case RTN_NAT: - DN_FIB_RES_RESET(*res); - refcount_inc(&fi->fib_clntref); - return 0; - case RTN_UNICAST: - case RTN_LOCAL: - for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (!fld->flowidn_oif || - fld->flowidn_oif == nh->nh_oif) - break; - } - if (nhsel < fi->fib_nhs) { - res->nh_sel = nhsel; - refcount_inc(&fi->fib_clntref); - return 0; - } - endfor_nexthops(fi); - res->fi = NULL; - return 1; - default: - net_err_ratelimited("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n", - type); - res->fi = NULL; - return -EINVAL; - } - } - return err; -} - -void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res) -{ - struct dn_fib_info *fi = res->fi; - int w; - - spin_lock_bh(&dn_fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - power += nh->nh_weight; - nh->nh_power = nh->nh_weight; - } - } endfor_nexthops(fi); - fi->fib_power = power; - if (power < 0) { - spin_unlock_bh(&dn_fib_multipath_lock); - res->nh_sel = 0; - return; - } - } - - w = jiffies % fi->fib_power; - - change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { - if ((w -= nh->nh_power) <= 0) { - nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&dn_fib_multipath_lock); - return; - } - } - } endfor_nexthops(fi); - res->nh_sel = 0; - spin_unlock_bh(&dn_fib_multipath_lock); -} - -static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table) -{ - if (attrs[RTA_TABLE]) - table = nla_get_u32(attrs[RTA_TABLE]); - - return table; -} - -static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct dn_fib_table *tb; - struct rtmsg *r = 
nlmsg_data(nlh); - struct nlattr *attrs[RTA_MAX+1]; - int err; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*r), attrs, RTA_MAX, - rtm_dn_policy, extack); - if (err < 0) - return err; - - tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 0); - if (!tb) - return -ESRCH; - - return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb)); -} - -static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct dn_fib_table *tb; - struct rtmsg *r = nlmsg_data(nlh); - struct nlattr *attrs[RTA_MAX+1]; - int err; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*r), attrs, RTA_MAX, - rtm_dn_policy, extack); - if (err < 0) - return err; - - tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 1); - if (!tb) - return -ENOBUFS; - - return tb->insert(tb, r, attrs, nlh, &NETLINK_CB(skb)); -} - -static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa) -{ - struct dn_fib_table *tb; - struct { - struct nlmsghdr nlh; - struct rtmsg rtm; - } req; - struct { - struct nlattr hdr; - __le16 dst; - } dst_attr = { - .dst = dst, - }; - struct { - struct nlattr hdr; - __le16 prefsrc; - } prefsrc_attr = { - .prefsrc = ifa->ifa_local, - }; - struct { - struct nlattr hdr; - u32 oif; - } oif_attr = { - .oif = ifa->ifa_dev->dev->ifindex, - }; - struct nlattr *attrs[RTA_MAX+1] = { - [RTA_DST] = (struct nlattr *) &dst_attr, - [RTA_PREFSRC] = (struct nlattr * ) &prefsrc_attr, - [RTA_OIF] = (struct nlattr *) &oif_attr, - }; - - memset(&req.rtm, 0, sizeof(req.rtm)); - - if (type == RTN_UNICAST) - tb = dn_fib_get_table(RT_MIN_TABLE, 1); - else - tb = dn_fib_get_table(RT_TABLE_LOCAL, 1); - - if (tb == NULL) - return; - - req.nlh.nlmsg_len = sizeof(req); - req.nlh.nlmsg_type = cmd; - req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; - req.nlh.nlmsg_pid = 0; - req.nlh.nlmsg_seq = 0; - - req.rtm.rtm_dst_len = dst_len; - req.rtm.rtm_table = tb->n; - req.rtm.rtm_protocol = RTPROT_KERNEL; - req.rtm.rtm_scope = (type != RTN_LOCAL ? 
RT_SCOPE_LINK : RT_SCOPE_HOST); - req.rtm.rtm_type = type; - - if (cmd == RTM_NEWROUTE) - tb->insert(tb, &req.rtm, attrs, &req.nlh, NULL); - else - tb->delete(tb, &req.rtm, attrs, &req.nlh, NULL); -} - -static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa) -{ - - fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); - -#if 0 - if (!(dev->flags&IFF_UP)) - return; - /* In the future, we will want to add default routes here */ - -#endif -} - -static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa) -{ - int found_it = 0; - struct net_device *dev; - struct dn_dev *dn_db; - struct dn_ifaddr *ifa2; - - ASSERT_RTNL(); - - /* Scan device list */ - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) - continue; - for (ifa2 = rcu_dereference(dn_db->ifa_list); - ifa2 != NULL; - ifa2 = rcu_dereference(ifa2->ifa_next)) { - if (ifa2->ifa_local == ifa->ifa_local) { - found_it = 1; - break; - } - } - } - rcu_read_unlock(); - - if (found_it == 0) { - fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); - - if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) { - if (dn_fib_sync_down(ifa->ifa_local, NULL, 0)) - dn_fib_flush(); - } - } -} - -static void dn_fib_disable_addr(struct net_device *dev, int force) -{ - if (dn_fib_sync_down(0, dev, force)) - dn_fib_flush(); - dn_rt_cache_flush(0); - neigh_ifdown(&dn_neigh_table, dev); -} - -static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr; - - switch (event) { - case NETDEV_UP: - dn_fib_add_ifaddr(ifa); - dn_fib_sync_up(ifa->ifa_dev->dev); - dn_rt_cache_flush(-1); - break; - case NETDEV_DOWN: - dn_fib_del_ifaddr(ifa); - if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { - dn_fib_disable_addr(ifa->ifa_dev->dev, 1); - } else { - dn_rt_cache_flush(-1); - } - break; - } - return NOTIFY_DONE; -} - -static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force) -{ - int ret = 0; - int scope = RT_SCOPE_NOWHERE; - - if (force) - scope = -1; - - for_fib_info() { - /* - * This makes no sense for DECnet.... we will almost - * certainly have more than one local address the same - * over all our interfaces. It needs thinking about - * some more. 
- */ - if (local && fi->fib_prefsrc == local) { - fi->fib_flags |= RTNH_F_DEAD; - ret++; - } else if (dev && fi->fib_nhs) { - int dead = 0; - - change_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) - dead++; - else if (nh->nh_dev == dev && - nh->nh_scope != scope) { - spin_lock_bh(&dn_fib_multipath_lock); - nh->nh_flags |= RTNH_F_DEAD; - fi->fib_power -= nh->nh_power; - nh->nh_power = 0; - spin_unlock_bh(&dn_fib_multipath_lock); - dead++; - } - } endfor_nexthops(fi) - if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; - ret++; - } - } - } endfor_fib_info(); - return ret; -} - - -static int dn_fib_sync_up(struct net_device *dev) -{ - int ret = 0; - - if (!(dev->flags&IFF_UP)) - return 0; - - for_fib_info() { - int alive = 0; - - change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - alive++; - continue; - } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) - continue; - if (nh->nh_dev != dev || dev->dn_ptr == NULL) - continue; - alive++; - spin_lock_bh(&dn_fib_multipath_lock); - nh->nh_power = 0; - nh->nh_flags &= ~RTNH_F_DEAD; - spin_unlock_bh(&dn_fib_multipath_lock); - } endfor_nexthops(fi); - - if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; - ret++; - } - } endfor_fib_info(); - return ret; -} - -static struct notifier_block dn_fib_dnaddr_notifier = { - .notifier_call = dn_fib_dnaddr_event, -}; - -void __exit dn_fib_cleanup(void) -{ - dn_fib_table_cleanup(); - dn_fib_rules_cleanup(); - - unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier); -} - - -void __init dn_fib_init(void) -{ - dn_fib_table_init(); - dn_fib_rules_init(); - - register_dnaddr_notifier(&dn_fib_dnaddr_notifier); - - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE, - dn_fib_rtm_newroute, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE, - dn_fib_rtm_delroute, NULL, 0); -} diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c deleted file mode 100644 index 7c569bcc0aca..000000000000 --- a/net/decnet/dn_neigh.c +++ /dev/null @@ -1,607 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Neighbour Functions (Adjacency Database and - * On-Ethernet Cache) - * - * Author: Steve Whitehouse - * - * - * Changes: - * Steve Whitehouse : Fixed router listing routine - * Steve Whitehouse : Added error_report functions - * Steve Whitehouse : Added default router detection - * Steve Whitehouse : Hop counts in outgoing messages - * Steve Whitehouse : Fixed src/dst in outgoing messages so - * forwarding now stands a good chance of - * working. - * Steve Whitehouse : Fixed neighbour states (for now anyway). - * Steve Whitehouse : Made error_report functions dummies. This - * is not the right place to return skbs. - * Steve Whitehouse : Convert to seq_file - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int dn_neigh_construct(struct neighbour *); -static void dn_neigh_error_report(struct neighbour *, struct sk_buff *); -static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb); - -/* - * Operations for adding the link layer header. 
- */ -static const struct neigh_ops dn_neigh_ops = { - .family = AF_DECnet, - .error_report = dn_neigh_error_report, - .output = dn_neigh_output, - .connected_output = dn_neigh_output, -}; - -static u32 dn_neigh_hash(const void *pkey, - const struct net_device *dev, - __u32 *hash_rnd) -{ - return jhash_2words(*(__u16 *)pkey, 0, hash_rnd[0]); -} - -static bool dn_key_eq(const struct neighbour *neigh, const void *pkey) -{ - return neigh_key_eq16(neigh, pkey); -} - -struct neigh_table dn_neigh_table = { - .family = PF_DECnet, - .entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)), - .key_len = sizeof(__le16), - .protocol = cpu_to_be16(ETH_P_DNA_RT), - .hash = dn_neigh_hash, - .key_eq = dn_key_eq, - .constructor = dn_neigh_construct, - .id = "dn_neigh_cache", - .parms ={ - .tbl = &dn_neigh_table, - .reachable_time = 30 * HZ, - .data = { - [NEIGH_VAR_MCAST_PROBES] = 0, - [NEIGH_VAR_UCAST_PROBES] = 0, - [NEIGH_VAR_APP_PROBES] = 0, - [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, - [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, - [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, - [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, - [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, - [NEIGH_VAR_PROXY_QLEN] = 0, - [NEIGH_VAR_ANYCAST_DELAY] = 0, - [NEIGH_VAR_PROXY_DELAY] = 0, - [NEIGH_VAR_LOCKTIME] = 1 * HZ, - }, - }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, -}; - -static int dn_neigh_construct(struct neighbour *neigh) -{ - struct net_device *dev = neigh->dev; - struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); - struct dn_dev *dn_db; - struct neigh_parms *parms; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) { - rcu_read_unlock(); - return -EINVAL; - } - - parms = dn_db->neigh_parms; - if (!parms) { - rcu_read_unlock(); - return -EINVAL; - } - - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); - - neigh->ops = &dn_neigh_ops; - neigh->nud_state = NUD_NOARP; - neigh->output = neigh->ops->connected_output; - - if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT)) - memcpy(neigh->ha, dev->broadcast, dev->addr_len); - else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK)) - dn_dn2eth(neigh->ha, dn->addr); - else { - net_dbg_ratelimited("Trying to create neigh for hw %d\n", - dev->type); - return -EINVAL; - } - - /* - * Make an estimate of the remote block size by assuming that its - * two less then the device mtu, which it true for ethernet (and - * other things which support long format headers) since there is - * an extra length field (of 16 bits) which isn't part of the - * ethernet headers and which the DECnet specs won't admit is part - * of the DECnet routing headers either. - * - * If we over estimate here its no big deal, the NSP negotiations - * will prevent us from sending packets which are too large for the - * remote node to handle. In any case this figure is normally updated - * by a hello message in most cases. 
- */ - dn->blksize = dev->mtu - 2; - - return 0; -} - -static void dn_neigh_error_report(struct neighbour *neigh, struct sk_buff *skb) -{ - printk(KERN_DEBUG "dn_neigh_error_report: called\n"); - kfree_skb(skb); -} - -static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *)dst; - struct net_device *dev = neigh->dev; - char mac_addr[ETH_ALEN]; - unsigned int seq; - int err; - - dn_dn2eth(mac_addr, rt->rt_local_src); - do { - seq = read_seqbegin(&neigh->ha_lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, mac_addr, skb->len); - } while (read_seqretry(&neigh->ha_lock, seq)); - - if (err >= 0) - err = dev_queue_xmit(skb); - else { - kfree_skb(skb); - err = -EINVAL; - } - return err; -} - -static int dn_neigh_output_packet(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *)dst; - struct neighbour *neigh = rt->n; - - return neigh->output(neigh, skb); -} - -/* - * For talking to broadcast devices: Ethernet & PPP - */ -static int dn_long_output(struct neighbour *neigh, struct sock *sk, - struct sk_buff *skb) -{ - struct net_device *dev = neigh->dev; - int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; - unsigned char *data; - struct dn_long_packet *lp; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - - if (skb_headroom(skb) < headroom) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); - if (skb2 == NULL) { - net_crit_ratelimited("dn_long_output: no memory\n"); - kfree_skb(skb); - return -ENOBUFS; - } - consume_skb(skb); - skb = skb2; - net_info_ratelimited("dn_long_output: Increasing headroom\n"); - } - - data = skb_push(skb, sizeof(struct dn_long_packet) + 3); - lp = (struct dn_long_packet *)(data+3); - - *((__le16 *)data) = cpu_to_le16(skb->len - 2); - *(data + 2) = 1 | DN_RT_F_PF; /* Padding */ - - lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS)); - lp->d_area = lp->d_subarea = 0; - dn_dn2eth(lp->d_id, cb->dst); - lp->s_area = lp->s_subarea = 0; - dn_dn2eth(lp->s_id, cb->src); - lp->nl2 = 0; - lp->visit_ct = cb->hops & 0x3f; - lp->s_class = 0; - lp->pt = 0; - - skb_reset_network_header(skb); - - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, - &init_net, sk, skb, NULL, neigh->dev, - dn_neigh_output_packet); -} - -/* - * For talking to pointopoint and multidrop devices: DDCMP and X.25 - */ -static int dn_short_output(struct neighbour *neigh, struct sock *sk, - struct sk_buff *skb) -{ - struct net_device *dev = neigh->dev; - int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; - struct dn_short_packet *sp; - unsigned char *data; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - - if (skb_headroom(skb) < headroom) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); - if (skb2 == NULL) { - net_crit_ratelimited("dn_short_output: no memory\n"); - kfree_skb(skb); - return -ENOBUFS; - } - consume_skb(skb); - skb = skb2; - net_info_ratelimited("dn_short_output: Increasing headroom\n"); - } - - data = skb_push(skb, sizeof(struct dn_short_packet) + 2); - *((__le16 *)data) = cpu_to_le16(skb->len - 2); - sp = (struct dn_short_packet *)(data+2); - - sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); - sp->dstnode = cb->dst; - sp->srcnode = cb->src; - sp->forward = cb->hops & 0x3f; - - skb_reset_network_header(skb); - - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, - 
&init_net, sk, skb, NULL, neigh->dev, - dn_neigh_output_packet); -} - -/* - * For talking to DECnet phase III nodes - * Phase 3 output is the same as short output, execpt that - * it clears the area bits before transmission. - */ -static int dn_phase3_output(struct neighbour *neigh, struct sock *sk, - struct sk_buff *skb) -{ - struct net_device *dev = neigh->dev; - int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; - struct dn_short_packet *sp; - unsigned char *data; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - if (skb_headroom(skb) < headroom) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); - if (skb2 == NULL) { - net_crit_ratelimited("dn_phase3_output: no memory\n"); - kfree_skb(skb); - return -ENOBUFS; - } - consume_skb(skb); - skb = skb2; - net_info_ratelimited("dn_phase3_output: Increasing headroom\n"); - } - - data = skb_push(skb, sizeof(struct dn_short_packet) + 2); - *((__le16 *)data) = cpu_to_le16(skb->len - 2); - sp = (struct dn_short_packet *)(data + 2); - - sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); - sp->dstnode = cb->dst & cpu_to_le16(0x03ff); - sp->srcnode = cb->src & cpu_to_le16(0x03ff); - sp->forward = cb->hops & 0x3f; - - skb_reset_network_header(skb); - - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, - &init_net, sk, skb, NULL, neigh->dev, - dn_neigh_output_packet); -} - -int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *neigh = rt->n; - struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); - struct dn_dev *dn_db; - bool use_long; - - rcu_read_lock(); - dn_db = rcu_dereference(neigh->dev->dn_ptr); - if (dn_db == NULL) { - rcu_read_unlock(); - return -EINVAL; - } - use_long = dn_db->use_long; - rcu_read_unlock(); - - if (dn->flags & DN_NDFLAG_P3) - return dn_phase3_output(neigh, sk, skb); - if (use_long) - return dn_long_output(neigh, sk, skb); - else - return dn_short_output(neigh, sk, skb); -} - -/* - * Unfortunately, the neighbour code uses the device in its hash - * function, so we don't get any advantage from it. This function - * basically does a neigh_lookup(), but without comparing the device - * field. 
- -/* - * Pointopoint link receives a hello message - */ -void dn_neigh_pointopoint_hello(struct sk_buff *skb) -{ - kfree_skb(skb); -} - -/* - * Ethernet router hello message received - */ -int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data; - - struct neighbour *neigh; - struct dn_neigh *dn; - struct dn_dev *dn_db; - __le16 src; - - src = dn_eth2dn(msg->id); - - neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - - dn = container_of(neigh, struct dn_neigh, n); - - if (neigh) { - write_lock(&neigh->lock); - - neigh->used = jiffies; - dn_db = rcu_dereference(neigh->dev->dn_ptr); - - if (!(neigh->nud_state & NUD_PERMANENT)) { - neigh->updated = jiffies; - - if (neigh->dev->type == ARPHRD_ETHER) - memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN); - - dn->blksize = le16_to_cpu(msg->blksize); - dn->priority = msg->priority; - - dn->flags &= ~DN_NDFLAG_P3; - - switch (msg->iinfo & DN_RT_INFO_TYPE) { - case DN_RT_INFO_L1RT: - dn->flags &=~DN_NDFLAG_R2; - dn->flags |= DN_NDFLAG_R1; - break; - case DN_RT_INFO_L2RT: - dn->flags |= DN_NDFLAG_R2; - } - } - - /* Only use routers in our area */ - if ((le16_to_cpu(src)>>10) == (le16_to_cpu((decnet_address))>>10)) { - if (!dn_db->router) { - dn_db->router = neigh_clone(neigh); - } else { - if (msg->priority > container_of(dn_db->router, - struct dn_neigh, n)->priority) - neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); - } - } - write_unlock(&neigh->lock); - neigh_release(neigh); - } - - kfree_skb(skb); - return 0; -} - -/* - * Endnode hello message received - */ -int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; - struct neighbour *neigh; - struct dn_neigh *dn; - __le16 src; - - src = dn_eth2dn(msg->id); - - neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - - dn = container_of(neigh, struct dn_neigh, n); - - if (neigh) { - write_lock(&neigh->lock); - - neigh->used = jiffies; - - if (!(neigh->nud_state & NUD_PERMANENT)) { - neigh->updated = jiffies; - - if (neigh->dev->type == ARPHRD_ETHER) - memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN); - dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2); - dn->blksize = le16_to_cpu(msg->blksize); - dn->priority = 0; - } - - write_unlock(&neigh->lock); - neigh_release(neigh); - } - - kfree_skb(skb); - return 0; -} - -static char *dn_find_slot(char *base, int max, int priority) -{ - int i; - unsigned char *min = NULL; - - base += 6; /* skip first id */ - - for(i = 0; i < max; i++) { - if (!min || (*base < *min)) - min = base; - base += 7; /* find next priority */ - } - - if (!min) - return NULL; - - return (*min < priority) ? (min - 6) : NULL; -} - -struct elist_cb_state { - struct net_device *dev; - unsigned char *ptr; - unsigned char *rs; - int t, n; -}; - -static void neigh_elist_cb(struct neighbour *neigh, void *_info) -{ - struct elist_cb_state *s = _info; - struct dn_neigh *dn; - - if (neigh->dev != s->dev) - return; - - dn = container_of(neigh, struct dn_neigh, n); - if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) - return; - - if (s->t == s->n) - s->rs = dn_find_slot(s->ptr, s->n, dn->priority); - else - s->t++; - if (s->rs == NULL) - return; - - dn_dn2eth(s->rs, dn->addr); - s->rs += 6; - *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 0x80 : 0x0; - *(s->rs) |= dn->priority; - s->rs++; -}
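The router list that dn_find_slot() scans and neigh_elist_cb() fills above is an array of 7-byte slots: a 6-byte node id laid down by dn_dn2eth(), then one byte whose top bit records NUD_CONNECTED and whose low bits carry the router priority. A hedged userspace sketch of a decoder for one slot; the struct is illustrative, and the AA:00:04:00 prefix mirrors the dn_hiord_addr constant that appears later in this series:

  #include <stdint.h>
  #include <stdio.h>

  struct rtr_slot {
      uint8_t id[6];        /* node id, as written by dn_dn2eth() */
      uint8_t state_prio;   /* bit 7: connected, low bits: priority */
  };

  static void dump_slot(const uint8_t *raw)
  {
      const struct rtr_slot *s = (const struct rtr_slot *)raw;

      printf("id=%02x:%02x:%02x:%02x:%02x:%02x connected=%d prio=%d\n",
             s->id[0], s->id[1], s->id[2], s->id[3], s->id[4], s->id[5],
             !!(s->state_prio & 0x80), s->state_prio & 0x7f);
  }

  int main(void)
  {
      /* DECnet node ids live under the AA:00:04:00 prefix */
      uint8_t slot[7] = { 0xaa, 0x00, 0x04, 0x00, 0x02, 0x14, 0x80 | 64 };

      dump_slot(slot);
      return 0;
  }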
- -int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n) -{ - struct elist_cb_state state; - - state.dev = dev; - state.t = 0; - state.n = n; - state.ptr = ptr; - state.rs = ptr; - - neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state); - - return state.t; -} - - -#ifdef CONFIG_PROC_FS - -static inline void dn_neigh_format_entry(struct seq_file *seq, - struct neighbour *n) -{ - struct dn_neigh *dn = container_of(n, struct dn_neigh, n); - char buf[DN_ASCBUF_LEN]; - - read_lock(&n->lock); - seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n", - dn_addr2asc(le16_to_cpu(dn->addr), buf), - (dn->flags&DN_NDFLAG_R1) ? "1" : "-", - (dn->flags&DN_NDFLAG_R2) ? "2" : "-", - (dn->flags&DN_NDFLAG_P3) ? "3" : "-", - dn->n.nud_state, - refcount_read(&dn->n.refcnt), - dn->blksize, - (dn->n.dev) ? dn->n.dev->name : "?"); - read_unlock(&n->lock); -} - -static int dn_neigh_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_puts(seq, "Addr Flags State Use Blksize Dev\n"); - } else { - dn_neigh_format_entry(seq, v); - } - - return 0; -} - -static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos) -{ - return neigh_seq_start(seq, pos, &dn_neigh_table, - NEIGH_SEQ_NEIGH_ONLY); -} - -static const struct seq_operations dn_neigh_seq_ops = { - .start = dn_neigh_seq_start, - .next = neigh_seq_next, - .stop = neigh_seq_stop, - .show = dn_neigh_seq_show, -}; -#endif - -void __init dn_neigh_init(void) -{ - neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table); - proc_create_net("decnet_neigh", 0444, init_net.proc_net, - &dn_neigh_seq_ops, sizeof(struct neigh_seq_state)); -} - -void __exit dn_neigh_cleanup(void) -{ - remove_proc_entry("decnet_neigh", init_net.proc_net); - neigh_table_clear(NEIGH_DN_TABLE, &dn_neigh_table); -} diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c deleted file mode 100644 index c59be5b04479..000000000000 --- a/net/decnet/dn_nsp_in.c +++ /dev/null @@ -1,907 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Network Services Protocol (Input) - * - * Author: Eduardo Marcelo Serrat - * - * Changes: - * - * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from - * original dn_nsp.c. - * Steve Whitehouse: Updated to work with my new routing architecture. - * Steve Whitehouse: Add changes from Eduardo Serrat's patches. - * Steve Whitehouse: Put all ack handling code in a common routine. - * Steve Whitehouse: Put other common bits into dn_nsp_rx() - * Steve Whitehouse: More checks on skb->len to catch bogus packets - * Fixed various race conditions and possible nasties. - * Steve Whitehouse: Now handles returned conninit frames. - * David S. Miller: New socket locking - * Steve Whitehouse: Fixed lockup when socket filtering was enabled. - * Paul Koning: Fix to push CC sockets into RUN when acks are - * received. - * Steve Whitehouse: - * Patrick Caulfield: Checking conninits for correctness & sending of error - * responses. - * Steve Whitehouse: Added backlog congestion level return codes. - * Patrick Caulfield: - * Steve Whitehouse: Added flow control support (outbound) - * Steve Whitehouse: Prepare for nonlinear skbs - */ - -/****************************************************************************** - (c) 1995-1998 E.M.
Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern int decnet_log_martians; - -static void dn_log_martian(struct sk_buff *skb, const char *msg) -{ - if (decnet_log_martians) { - char *devname = skb->dev ? skb->dev->name : "???"; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - net_info_ratelimited("DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", - msg, devname, - le16_to_cpu(cb->src), - le16_to_cpu(cb->dst), - le16_to_cpu(cb->src_port), - le16_to_cpu(cb->dst_port)); - } -} - -/* - * For this function we've flipped the cross-subchannel bit - * if the message is an otherdata or linkservice message. Thus - * we can use it to work out what to update. - */ -static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short type = ((ack >> 12) & 0x0003); - int wakeup = 0; - - switch (type) { - case 0: /* ACK - Data */ - if (dn_after(ack, scp->ackrcv_dat)) { - scp->ackrcv_dat = ack & 0x0fff; - wakeup |= dn_nsp_check_xmit_queue(sk, skb, - &scp->data_xmit_queue, - ack); - } - break; - case 1: /* NAK - Data */ - break; - case 2: /* ACK - OtherData */ - if (dn_after(ack, scp->ackrcv_oth)) { - scp->ackrcv_oth = ack & 0x0fff; - wakeup |= dn_nsp_check_xmit_queue(sk, skb, - &scp->other_xmit_queue, - ack); - } - break; - case 3: /* NAK - OtherData */ - break; - } - - if (wakeup && !sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); -} - -/* - * This function is a universal ack processor. - */ -static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth) -{ - __le16 *ptr = (__le16 *)skb->data; - int len = 0; - unsigned short ack; - - if (skb->len < 2) - return len; - - if ((ack = le16_to_cpu(*ptr)) & 0x8000) { - skb_pull(skb, 2); - ptr++; - len += 2; - if ((ack & 0x4000) == 0) { - if (oth) - ack ^= 0x2000; - dn_ack(sk, skb, ack); - } - } - - if (skb->len < 2) - return len; - - if ((ack = le16_to_cpu(*ptr)) & 0x8000) { - skb_pull(skb, 2); - len += 2; - if ((ack & 0x4000) == 0) { - if (oth) - ack ^= 0x2000; - dn_ack(sk, skb, ack); - } - } - - return len; -} - - -/** - * dn_check_idf - Check an image data field format is correct. - * @pptr: Pointer to pointer to image data - * @len: Pointer to length of image data - * @max: The maximum allowed length of the data in the image data field - * @follow_on: Check that this many bytes exist beyond the end of the image data - * - * Returns: 0 if ok, -1 on error - */ -static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on) -{ - unsigned char *ptr = *pptr; - unsigned char flen = *ptr++; - - (*len)--; - if (flen > max) - return -1; - if ((flen + follow_on) > *len) - return -1; - - *len -= flen; - *pptr = ptr + flen; - return 0; -} - -/* - * Table of reason codes to pass back to node which sent us a badly - * formed message, plus text messages for the log. A zero entry in - * the reason field means "don't reply" otherwise a disc init is sent with - * the specified reason code. 
- */ -static struct { - unsigned short reason; - const char *text; -} ci_err_table[] = { - { 0, "CI: Truncated message" }, - { NSP_REASON_ID, "CI: Destination username error" }, - { NSP_REASON_ID, "CI: Destination username type" }, - { NSP_REASON_US, "CI: Source username error" }, - { 0, "CI: Truncated at menuver" }, - { 0, "CI: Truncated before access or user data" }, - { NSP_REASON_IO, "CI: Access data format error" }, - { NSP_REASON_IO, "CI: User data format error" } -}; - -/* - * This function uses a slightly different lookup method - * to find its sockets, since it searches on object name/number - * rather than port numbers. Various tests are done to ensure that - * the incoming data is in the correct format before it is queued to - * a socket. - */ -static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data; - struct sockaddr_dn dstaddr; - struct sockaddr_dn srcaddr; - unsigned char type = 0; - int dstlen; - int srclen; - unsigned char *ptr; - int len; - int err = 0; - unsigned char menuver; - - memset(&dstaddr, 0, sizeof(struct sockaddr_dn)); - memset(&srcaddr, 0, sizeof(struct sockaddr_dn)); - - /* - * 1. Decode & remove message header - */ - cb->src_port = msg->srcaddr; - cb->dst_port = msg->dstaddr; - cb->services = msg->services; - cb->info = msg->info; - cb->segsize = le16_to_cpu(msg->segsize); - - if (!pskb_may_pull(skb, sizeof(*msg))) - goto err_out; - - skb_pull(skb, sizeof(*msg)); - - len = skb->len; - ptr = skb->data; - - /* - * 2. Check destination end username format - */ - dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type); - err++; - if (dstlen < 0) - goto err_out; - - err++; - if (type > 1) - goto err_out; - - len -= dstlen; - ptr += dstlen; - - /* - * 3. Check source end username format - */ - srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type); - err++; - if (srclen < 0) - goto err_out; - - len -= srclen; - ptr += srclen; - err++; - if (len < 1) - goto err_out; - - menuver = *ptr; - ptr++; - len--; - - /* - * 4. Check that optional data actually exists if menuver says it does - */ - err++; - if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1)) - goto err_out; - - /* - * 5. Check optional access data format - */ - err++; - if (menuver & DN_MENUVER_ACC) { - if (dn_check_idf(&ptr, &len, 39, 1)) - goto err_out; - if (dn_check_idf(&ptr, &len, 39, 1)) - goto err_out; - if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0)) - goto err_out; - } - - /* - * 6. Check optional user data format - */ - err++; - if (menuver & DN_MENUVER_USR) { - if (dn_check_idf(&ptr, &len, 16, 0)) - goto err_out; - } - - /* - * 7. 
Look up socket based on destination end username - */ - return dn_sklist_find_listener(&dstaddr); -err_out: - dn_log_martian(skb, ci_err_table[err].text); - *reason = ci_err_table[err].reason; - return NULL; -} - - -static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) -{ - if (sk_acceptq_is_full(sk)) { - kfree_skb(skb); - return; - } - - sk_acceptq_added(sk); - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_state_change(sk); -} - -static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dn_scp *scp = DN_SK(sk); - unsigned char *ptr; - - if (skb->len < 4) - goto out; - - ptr = skb->data; - cb->services = *ptr++; - cb->info = *ptr++; - cb->segsize = le16_to_cpu(*(__le16 *)ptr); - - if ((scp->state == DN_CI) || (scp->state == DN_CD)) { - scp->persist = 0; - scp->addrrem = cb->src_port; - sk->sk_state = TCP_ESTABLISHED; - scp->state = DN_RUN; - scp->services_rem = cb->services; - scp->info_rem = cb->info; - scp->segsize_rem = cb->segsize; - - if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE) - scp->max_window = decnet_no_fc_max_cwnd; - - if (skb->len > 0) { - u16 dlen = *skb->data; - if ((dlen <= 16) && (dlen <= skb->len)) { - scp->conndata_in.opt_optl = cpu_to_le16(dlen); - skb_copy_from_linear_data_offset(skb, 1, - scp->conndata_in.opt_data, dlen); - } - } - dn_nsp_send_link(sk, DN_NOCHANGE, 0); - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - } - -out: - kfree_skb(skb); -} - -static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CI) { - scp->state = DN_CD; - scp->persist = 0; - } - - kfree_skb(skb); -} - -static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned short reason; - - if (skb->len < 2) - goto out; - - reason = le16_to_cpu(*(__le16 *)skb->data); - skb_pull(skb, 2); - - scp->discdata_in.opt_status = cpu_to_le16(reason); - scp->discdata_in.opt_optl = 0; - memset(scp->discdata_in.opt_data, 0, 16); - - if (skb->len > 0) { - u16 dlen = *skb->data; - if ((dlen <= 16) && (dlen <= skb->len)) { - scp->discdata_in.opt_optl = cpu_to_le16(dlen); - skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen); - } - } - - scp->addrrem = cb->src_port; - sk->sk_state = TCP_CLOSE; - - switch (scp->state) { - case DN_CI: - case DN_CD: - scp->state = DN_RJ; - sk->sk_err = ECONNREFUSED; - break; - case DN_RUN: - sk->sk_shutdown |= SHUTDOWN_MASK; - scp->state = DN_DN; - break; - case DN_DI: - scp->state = DN_DIC; - break; - } - - if (!sock_flag(sk, SOCK_DEAD)) { - if (sk->sk_socket->state != SS_UNCONNECTED) - sk->sk_socket->state = SS_DISCONNECTING; - sk->sk_state_change(sk); - } - - /* - * It appears that it's possible for remote machines to send disc - * init messages with no port identifier if we are in the CI and - * possibly also the CD state. Obviously we shouldn't reply with - * a message if we don't know what the end point is. - */ - if (scp->addrrem) { - dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC); - } - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - -out: - kfree_skb(skb); -} - -/* - * disc_conf messages are also called no_resources or no_link - * messages depending upon the "reason" field. - */
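dn_nsp_disc_conf() below accepts exactly one little-endian 16-bit reason word and, when the socket sits in the DN_DR state, maps NSP_REASON_DC and NSP_REASON_NL to different final states. A small userspace sketch of that parsing; the two reason values are assumed stand-ins, since the real codes live in the kernel's NSP headers:

  #include <stdint.h>
  #include <stddef.h>
  #include <stdio.h>

  #define NSP_REASON_DC 1   /* assumed value: disconnect confirm */
  #define NSP_REASON_NL 2   /* assumed value: no link */

  static int parse_disc_conf(const uint8_t *data, size_t len, uint16_t *reason)
  {
      if (len != 2)         /* same strict length check as below */
          return -1;
      *reason = (uint16_t)data[0] | ((uint16_t)data[1] << 8); /* LE */
      return 0;
  }

  int main(void)
  {
      const uint8_t payload[2] = { NSP_REASON_NL, 0x00 };
      uint16_t reason;

      if (parse_disc_conf(payload, sizeof(payload), &reason) == 0)
          printf("disc conf, reason=%u (%s)\n", reason,
                 reason == NSP_REASON_NL ? "no link" : "other");
      return 0;
  }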
-static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short reason; - - if (skb->len != 2) - goto out; - - reason = le16_to_cpu(*(__le16 *)skb->data); - - sk->sk_state = TCP_CLOSE; - - switch (scp->state) { - case DN_CI: - scp->state = DN_NR; - break; - case DN_DR: - if (reason == NSP_REASON_DC) - scp->state = DN_DRC; - if (reason == NSP_REASON_NL) - scp->state = DN_CN; - break; - case DN_DI: - scp->state = DN_DIC; - break; - case DN_RUN: - sk->sk_shutdown |= SHUTDOWN_MASK; - fallthrough; - case DN_CC: - scp->state = DN_CN; - } - - if (!sock_flag(sk, SOCK_DEAD)) { - if (sk->sk_socket->state != SS_UNCONNECTED) - sk->sk_socket->state = SS_DISCONNECTING; - sk->sk_state_change(sk); - } - - scp->persist_fxn = dn_destroy_timer; - scp->persist = dn_nsp_persist(sk); - -out: - kfree_skb(skb); -} - -static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short segnum; - unsigned char lsflags; - signed char fcval; - int wake_up = 0; - char *ptr = skb->data; - unsigned char fctype = scp->services_rem & NSP_FC_MASK; - - if (skb->len != 4) - goto out; - - segnum = le16_to_cpu(*(__le16 *)ptr); - ptr += 2; - lsflags = *(unsigned char *)ptr++; - fcval = *ptr; - - /* - * Here we ignore erroneous packets which should really - * cause a connection abort. It is not critical - * for now though. - */ - if (lsflags & 0xf8) - goto out; - - if (seq_next(scp->numoth_rcv, segnum)) { - seq_add(&scp->numoth_rcv, 1); - switch(lsflags & 0x04) { /* FCVAL INT */ - case 0x00: /* Normal Request */ - switch(lsflags & 0x03) { /* FCVAL MOD */ - case 0x00: /* Request count */ - if (fcval < 0) { - unsigned char p_fcval = -fcval; - if ((scp->flowrem_dat > p_fcval) && - (fctype == NSP_FC_SCMC)) { - scp->flowrem_dat -= p_fcval; - } - } else if (fcval > 0) { - scp->flowrem_dat += fcval; - wake_up = 1; - } - break; - case 0x01: /* Stop outgoing data */ - scp->flowrem_sw = DN_DONTSEND; - break; - case 0x02: /* Ok to start again */ - scp->flowrem_sw = DN_SEND; - dn_nsp_output(sk); - wake_up = 1; - } - break; - case 0x04: /* Interrupt Request */ - if (fcval > 0) { - scp->flowrem_oth += fcval; - wake_up = 1; - } - break; - } - if (wake_up && !sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - } - - dn_nsp_send_oth_ack(sk); - -out: - kfree_skb(skb); -} - -/* - * Copy of sock_queue_rcv_skb (from sock.h) without - * bh_lock_sock() (it's already held when this is called) which - * also allows data and other data to be queued to a socket. - */ -static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue) -{ - int err; - - /* Cast skb->rcvbuf to unsigned...
It's pointless, but reduces - number of warnings when compiling with -W --ANK - */ - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned int)sk->sk_rcvbuf) { - err = -ENOMEM; - goto out; - } - - err = sk_filter(sk, skb); - if (err) - goto out; - - skb_set_owner_r(skb, sk); - skb_queue_tail(queue, skb); - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); -out: - return err; -} - -static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short segnum; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - int queued = 0; - - if (skb->len < 2) - goto out; - - cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data); - skb_pull(skb, 2); - - if (seq_next(scp->numoth_rcv, segnum)) { - - if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) { - seq_add(&scp->numoth_rcv, 1); - scp->other_report = 0; - queued = 1; - } - } - - dn_nsp_send_oth_ack(sk); -out: - if (!queued) - kfree_skb(skb); -} - -static void dn_nsp_data(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - unsigned short segnum; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dn_scp *scp = DN_SK(sk); - - if (skb->len < 2) - goto out; - - cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data); - skb_pull(skb, 2); - - if (seq_next(scp->numdat_rcv, segnum)) { - if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) { - seq_add(&scp->numdat_rcv, 1); - queued = 1; - } - - if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) { - scp->flowloc_sw = DN_DONTSEND; - dn_nsp_send_link(sk, DN_DONTSEND, 0); - } - } - - dn_nsp_send_data_ack(sk); -out: - if (!queued) - kfree_skb(skb); -} - -/* - * If one of our conninit messages is returned, this function - * deals with it. It puts the socket into the NO_COMMUNICATION - * state. 
- */ -static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CI) { - scp->state = DN_NC; - sk->sk_state = TCP_CLOSE; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - } - - kfree_skb(skb); -} - -static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - int ret = NET_RX_DROP; - - /* Must not reply to returned packets */ - if (cb->rt_flags & DN_RT_F_RTS) - goto out; - - if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) { - switch (cb->nsp_flags & 0x70) { - case 0x10: - case 0x60: /* (Retransmitted) Connect Init */ - dn_nsp_return_disc(skb, NSP_DISCINIT, reason); - ret = NET_RX_SUCCESS; - break; - case 0x20: /* Connect Confirm */ - dn_nsp_return_disc(skb, NSP_DISCCONF, reason); - ret = NET_RX_SUCCESS; - break; - } - } - -out: - kfree_skb(skb); - return ret; -} - -static int dn_nsp_rx_packet(struct net *net, struct sock *sk2, - struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct sock *sk = NULL; - unsigned char *ptr = (unsigned char *)skb->data; - unsigned short reason = NSP_REASON_NL; - - if (!pskb_may_pull(skb, 2)) - goto free_out; - - skb_reset_transport_header(skb); - cb->nsp_flags = *ptr++; - - if (decnet_debug_level & 2) - printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags); - - if (cb->nsp_flags & 0x83) - goto free_out; - - /* - * Filter out conninits and useless packet types - */ - if ((cb->nsp_flags & 0x0c) == 0x08) { - switch (cb->nsp_flags & 0x70) { - case 0x00: /* NOP */ - case 0x70: /* Reserved */ - case 0x50: /* Reserved, Phase II node init */ - goto free_out; - case 0x10: - case 0x60: - if (unlikely(cb->rt_flags & DN_RT_F_RTS)) - goto free_out; - sk = dn_find_listener(skb, &reason); - goto got_it; - } - } - - if (!pskb_may_pull(skb, 3)) - goto free_out; - - /* - * Grab the destination address. - */ - cb->dst_port = *(__le16 *)ptr; - cb->src_port = 0; - ptr += 2; - - /* - * If not a connack, grab the source address too. - */ - if (pskb_may_pull(skb, 5)) { - cb->src_port = *(__le16 *)ptr; - ptr += 2; - skb_pull(skb, 5); - } - - /* - * Returned packets... - * Swap src & dst and look up in the normal way. - */ - if (unlikely(cb->rt_flags & DN_RT_F_RTS)) { - swap(cb->dst_port, cb->src_port); - swap(cb->dst, cb->src); - } - - /* - * Find the socket to which this skb is destined. - */ - sk = dn_find_by_skb(skb); -got_it: - if (sk != NULL) { - struct dn_scp *scp = DN_SK(sk); - - /* Reset backoff */ - scp->nsp_rxtshift = 0; - - /* - * We linearize everything except data segments here. - */ - if (cb->nsp_flags & ~0x60) { - if (unlikely(skb_linearize(skb))) - goto free_out; - } - - return sk_receive_skb(sk, skb, 0); - } - - return dn_nsp_no_socket(skb, reason); - -free_out: - kfree_skb(skb); - return NET_RX_DROP; -} - -int dn_nsp_rx(struct sk_buff *skb) -{ - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, - &init_net, NULL, skb, skb->dev, NULL, - dn_nsp_rx_packet); -} - -/* - * This is the main receive routine for sockets. It is called - * from the above when the socket is not busy, and also from - * sock_release() when there is a backlog queued up. - */
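dn_nsp_backlog_rcv() below dispatches purely on bit tests of nsp_flags: (flags & 0x0c) == 0x08 selects control messages whose subtype sits in the 0x70 field, flags == 0x24 is the connect-ack special case (it carries no ack info), and the remaining values are data, other-data and link-service frames. A hedged userspace sketch of the same decode, for illustration only:

  #include <stdint.h>
  #include <stdio.h>

  static const char *nsp_msg_name(uint8_t flags)
  {
      if ((flags & 0x0c) == 0x08) {   /* control messages */
          switch (flags & 0x70) {
          case 0x10:
          case 0x60: return "conn init";
          case 0x20: return "conn conf";
          case 0x30: return "disc init";
          case 0x40: return "disc conf";
          }
          return "unknown control";
      }
      if (flags == 0x24)              /* no ack data carried */
          return "conn ack";
      switch (flags) {                /* data-bearing subtypes */
      case 0x10: return "link service";
      case 0x30: return "other data";
      default:   return "data or ack";
      }
  }

  int main(void)
  {
      const uint8_t samples[] = { 0x18, 0x28, 0x38, 0x48, 0x24, 0x10, 0x30, 0x04 };
      unsigned int i;

      for (i = 0; i < sizeof(samples); i++)
          printf("0x%02x -> %s\n", samples[i], nsp_msg_name(samples[i]));
      return 0;
  }

Feeding it 0x18 and 0x68 also shows why exactly those two values are recognised as returned connect-inits at the top of dn_nsp_backlog_rcv().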
-int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - if (cb->rt_flags & DN_RT_F_RTS) { - if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68) - dn_returned_conn_init(sk, skb); - else - kfree_skb(skb); - return NET_RX_SUCCESS; - } - - /* - * Control packet. - */ - if ((cb->nsp_flags & 0x0c) == 0x08) { - switch (cb->nsp_flags & 0x70) { - case 0x10: - case 0x60: - dn_nsp_conn_init(sk, skb); - break; - case 0x20: - dn_nsp_conn_conf(sk, skb); - break; - case 0x30: - dn_nsp_disc_init(sk, skb); - break; - case 0x40: - dn_nsp_disc_conf(sk, skb); - break; - } - - } else if (cb->nsp_flags == 0x24) { - /* - * Special for connacks, 'cos they don't have - * ack data or ack otherdata info. - */ - dn_nsp_conn_ack(sk, skb); - } else { - int other = 1; - - /* both data and ack frames can kick a CC socket into RUN */ - if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) { - scp->state = DN_RUN; - sk->sk_state = TCP_ESTABLISHED; - sk->sk_state_change(sk); - } - - if ((cb->nsp_flags & 0x1c) == 0) - other = 0; - if (cb->nsp_flags == 0x04) - other = 0; - - /* - * Read out ack data here, this applies equally - * to data, other data, link service and both - * ack data and ack otherdata. - */ - dn_process_ack(sk, skb, other); - - /* - * If we've some sort of data here then call a - * suitable routine for dealing with it, otherwise - * the packet is an ack and can be discarded. - */ - if ((cb->nsp_flags & 0x0c) == 0) { - - if (scp->state != DN_RUN) - goto free_out; - - switch (cb->nsp_flags) { - case 0x10: /* LS */ - dn_nsp_linkservice(sk, skb); - break; - case 0x30: /* OD */ - dn_nsp_otherdata(sk, skb); - break; - default: - dn_nsp_data(sk, skb); - } - - } else { /* Ack, chuck it out here */ -free_out: - kfree_skb(skb); - } - } - - return NET_RX_SUCCESS; -} diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c deleted file mode 100644 index b05639bdfc8f..000000000000 --- a/net/decnet/dn_nsp_out.c +++ /dev/null @@ -1,696 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Network Services Protocol (Output) - * - * Author: Eduardo Marcelo Serrat - * - * Changes: - * - * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from - * original dn_nsp.c. - * Steve Whitehouse: Updated to work with my new routing architecture. - * Steve Whitehouse: Added changes from Eduardo Serrat's patches. - * Steve Whitehouse: Now conninits have the "return" bit set. - * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL! - * Moved output state machine into one function - * Steve Whitehouse: New output state machine - * Paul Koning: Connect Confirm message fix. - * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets. - * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean - * Steve Whitehouse: Moved dn_nsp_send() in here from route.h - */ - -/****************************************************************************** - (c) 1995-1998 E.M.
Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; - -static void dn_nsp_send(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - struct dn_scp *scp = DN_SK(sk); - struct dst_entry *dst; - struct flowidn fld; - - skb_reset_transport_header(skb); - scp->stamp = jiffies; - - dst = sk_dst_check(sk, 0); - if (dst) { -try_again: - skb_dst_set(skb, dst); - dst_output(&init_net, skb->sk, skb); - return; - } - - memset(&fld, 0, sizeof(fld)); - fld.flowidn_oif = sk->sk_bound_dev_if; - fld.saddr = dn_saddr2dn(&scp->addr); - fld.daddr = dn_saddr2dn(&scp->peer); - dn_sk_ports_copy(&fld, scp); - fld.flowidn_proto = DNPROTO_NSP; - if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) { - dst = sk_dst_get(sk); - sk->sk_route_caps = dst->dev->features; - goto try_again; - } - - sk->sk_err = EHOSTUNREACH; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); -} - - -/* - * If sk == NULL, then we assume that we are supposed to be making - * a routing layer skb. If sk != NULL, then we are supposed to be - * creating an skb for the NSP layer. - * - * The eventual aim is for each socket to have a cached header size - * for its outgoing packets, and to set hdr from this when sk != NULL. - */ -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri) -{ - struct sk_buff *skb; - int hdr = 64; - - if ((skb = alloc_skb(size + hdr, pri)) == NULL) - return NULL; - - skb->protocol = htons(ETH_P_DNA_RT); - skb->pkt_type = PACKET_OUTGOING; - - if (sk) - skb_set_owner_w(skb, sk); - - skb_reserve(skb, hdr); - - return skb; -} - -/* - * Calculate persist timer based upon the smoothed round - * trip time and the variance. Backoff according to the - * nsp_backoff[] array. - */ -unsigned long dn_nsp_persist(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; - - t *= nsp_backoff[scp->nsp_rxtshift]; - - if (t < HZ) t = HZ; - if (t > (600*HZ)) t = (600*HZ); - - if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT) - scp->nsp_rxtshift++; - - /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */ - - return t; -} - -/* - * This is called each time we get an estimate for the rtt - * on the link. - */ -static void dn_nsp_rtt(struct sock *sk, long rtt) -{ - struct dn_scp *scp = DN_SK(sk); - long srtt = (long)scp->nsp_srtt; - long rttvar = (long)scp->nsp_rttvar; - long delta; - - /* - * If the jiffies clock flips over in the middle of timestamp - * gathering this value might turn out negative, so we make sure - * that it is always positive here. - */
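The update that follows keeps nsp_srtt scaled by a factor of eight and applies the classic 1/8 and 1/4 gains to the mean and the deviation. A standalone sketch of the same arithmetic with a worked example: starting from srtt = 64 (8 jiffies) and rttvar = 8, a single 16-jiffy sample yields srtt = 72 and rttvar = 14, so the unscaled base timer used by dn_nsp_persist(), ((srtt >> 2) + rttvar) >> 1, comes to 16 jiffies:

  #include <stdio.h>

  struct rtt_state {
      long srtt;    /* smoothed rtt, scaled by 8 (nsp_srtt) */
      long rttvar;  /* smoothed deviation (nsp_rttvar) */
  };

  static void rtt_update(struct rtt_state *s, long rtt)
  {
      long delta;

      if (rtt < 0)                    /* jiffies wrap guard, as below */
          rtt = -rtt;

      delta = (rtt << 3) - s->srtt;   /* error, in srtt's x8 scale */
      s->srtt += delta >> 3;          /* gain 1/8 */
      if (s->srtt < 1)
          s->srtt = 1;

      delta >>= 1;
      s->rttvar += ((delta > 0 ? delta : -delta) - s->rttvar) >> 2;
      if (s->rttvar < 1)
          s->rttvar = 1;
  }

  int main(void)
  {
      struct rtt_state s = { .srtt = 64, .rttvar = 8 };

      rtt_update(&s, 16);
      printf("srtt=%ld rttvar=%ld base=%ld\n",
             s.srtt, s.rttvar, ((s.srtt >> 2) + s.rttvar) >> 1);
      return 0;
  }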
- if (rtt < 0) - rtt = -rtt; - /* - * Add new rtt to smoothed average - */ - delta = ((rtt << 3) - srtt); - srtt += (delta >> 3); - if (srtt >= 1) - scp->nsp_srtt = (unsigned long)srtt; - else - scp->nsp_srtt = 1; - - /* - * Add new rtt variance to smoothed variance - */ - delta >>= 1; - rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); - if (rttvar >= 1) - scp->nsp_rttvar = (unsigned long)rttvar; - else - scp->nsp_rttvar = 1; - - /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */ -} - -/** - * dn_nsp_clone_and_send - Send a data packet by cloning it - * @skb: The packet to clone and transmit - * @gfp: memory allocation flag - * - * Clone a queued data or other data packet and transmit it. - * - * Returns: The number of times the packet has been sent previously - */ -static inline unsigned int dn_nsp_clone_and_send(struct sk_buff *skb, - gfp_t gfp) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct sk_buff *skb2; - int ret = 0; - - if ((skb2 = skb_clone(skb, gfp)) != NULL) { - ret = cb->xmit_count; - cb->xmit_count++; - cb->stamp = jiffies; - skb2->sk = skb->sk; - dn_nsp_send(skb2); - } - - return ret; -} - -/** - * dn_nsp_output - Try and send something from socket queues - * @sk: The socket whose queues are to be investigated - * - * Try and send the packet on the end of the data and other data queues. - * Other data gets priority over data, and if we retransmit a packet we - * reduce the window by dividing it in two. - * - */ -void dn_nsp_output(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb; - unsigned int reduce_win = 0; - - /* - * First we check for otherdata/linkservice messages - */ - if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL) - reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); - - /* - * If we may not send any data, we don't. - * If we are still trying to get some other data down the - * channel, we don't try and send any data. - */ - if (reduce_win || (scp->flowrem_sw != DN_SEND)) - goto recalc_window; - - if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL) - reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); - - /* - * If we've sent any frame more than once, we cut the - * send window size in half. There is always a minimum - * window size of one available. - */
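The recalc_window path below is the multiplicative-decrease half of the scheme: any frame seen more than once halves snd_window, with NSP_MIN_WINDOW as the floor. A tiny sketch assuming a floor of 1 (the real NSP_MIN_WINDOW is defined in the kernel's NSP headers); from a window of 8 the sequence is 8 -> 4 -> 2 -> 1 -> 1:

  #include <stdio.h>

  #define NSP_MIN_WINDOW 1  /* assumed value, for illustration */

  static unsigned long shrink_window(unsigned long snd_window)
  {
      snd_window >>= 1;     /* halve on any retransmitted frame */
      if (snd_window < NSP_MIN_WINDOW)
          snd_window = NSP_MIN_WINDOW;
      return snd_window;
  }

  int main(void)
  {
      unsigned long win = 8;
      int i;

      for (i = 0; i < 4; i++) {
          win = shrink_window(win);
          printf("window=%lu\n", win);
      }
      return 0;
  }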
-recalc_window: - if (reduce_win) { - scp->snd_window >>= 1; - if (scp->snd_window < NSP_MIN_WINDOW) - scp->snd_window = NSP_MIN_WINDOW; - } -} - -int dn_nsp_xmit_timeout(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - dn_nsp_output(sk); - - if (!skb_queue_empty(&scp->data_xmit_queue) || - !skb_queue_empty(&scp->other_xmit_queue)) - scp->persist = dn_nsp_persist(sk); - - return 0; -} - -static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len) -{ - unsigned char *ptr = skb_push(skb, len); - - BUG_ON(len < 5); - - *ptr++ = msgflag; - *((__le16 *)ptr) = scp->addrrem; - ptr += 2; - *((__le16 *)ptr) = scp->addrloc; - ptr += 2; - return (__le16 __force *)ptr; -} - -static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other) -{ - struct dn_scp *scp = DN_SK(sk); - unsigned short acknum = scp->numdat_rcv & 0x0FFF; - unsigned short ackcrs = scp->numoth_rcv & 0x0FFF; - __le16 *ptr; - - BUG_ON(hlen < 9); - - scp->ackxmt_dat = acknum; - scp->ackxmt_oth = ackcrs; - acknum |= 0x8000; - ackcrs |= 0x8000; - - /* If this is an "other data/ack" message, swap acknum and ackcrs */ - if (other) - swap(acknum, ackcrs); - - /* Set "cross subchannel" bit in ackcrs */ - ackcrs |= 0x2000; - - ptr = dn_mk_common_header(scp, skb, msgflag, hlen); - - *ptr++ = cpu_to_le16(acknum); - *ptr++ = cpu_to_le16(ackcrs); - - return ptr; -} - -static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - __le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth); - - if (unlikely(oth)) { - cb->segnum = scp->numoth; - seq_add(&scp->numoth, 1); - } else { - cb->segnum = scp->numdat; - seq_add(&scp->numdat, 1); - } - *(ptr++) = cpu_to_le16(cb->segnum); - - return ptr; -} - -void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, - gfp_t gfp, int oth) -{ - struct dn_scp *scp = DN_SK(sk); - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; - - cb->xmit_count = 0; - dn_nsp_mk_data_header(sk, skb, oth); - - /* - * Slow start: If we have been idle for more than - * one RTT, then reset window to min size. - */ - if (time_is_before_jiffies(scp->stamp + t)) - scp->snd_window = NSP_MIN_WINDOW; - - if (oth) - skb_queue_tail(&scp->other_xmit_queue, skb); - else - skb_queue_tail(&scp->data_xmit_queue, skb); - - if (scp->flowrem_sw != DN_SEND) - return; - - dn_nsp_clone_and_send(skb, gfp); -} - - -int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb2, *n, *ack = NULL; - int wakeup = 0; - int try_retrans = 0; - unsigned long reftime = cb->stamp; - unsigned long pkttime; - unsigned short xmit_count; - unsigned short segnum; - - skb_queue_walk_safe(q, skb2, n) { - struct dn_skb_cb *cb2 = DN_SKB_CB(skb2); - - if (dn_before_or_equal(cb2->segnum, acknum)) - ack = skb2; - - /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ?
"ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */ - - if (ack == NULL) - continue; - - /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */ - - /* Does _last_ packet acked have xmit_count > 1 */ - try_retrans = 0; - /* Remember to wake up the sending process */ - wakeup = 1; - /* Keep various statistics */ - pkttime = cb2->stamp; - xmit_count = cb2->xmit_count; - segnum = cb2->segnum; - /* Remove and drop ack'ed packet */ - skb_unlink(ack, q); - kfree_skb(ack); - ack = NULL; - - /* - * We don't expect to see acknowledgements for packets we - * haven't sent yet. - */ - WARN_ON(xmit_count == 0); - - /* - * If the packet has only been sent once, we can use it - * to calculate the RTT and also open the window a little - * further. - */ - if (xmit_count == 1) { - if (dn_equal(segnum, acknum)) - dn_nsp_rtt(sk, (long)(pkttime - reftime)); - - if (scp->snd_window < scp->max_window) - scp->snd_window++; - } - - /* - * Packet has been sent more than once. If this is the last - * packet to be acknowledged then we want to send the next - * packet in the send queue again (assumes the remote host does - * go-back-N error control). - */ - if (xmit_count > 1) - try_retrans = 1; - } - - if (try_retrans) - dn_nsp_output(sk); - - return wakeup; -} - -void dn_nsp_send_data_ack(struct sock *sk) -{ - struct sk_buff *skb = NULL; - - if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) - return; - - skb_reserve(skb, 9); - dn_mk_ack_header(sk, skb, 0x04, 9, 0); - dn_nsp_send(skb); -} - -void dn_nsp_send_oth_ack(struct sock *sk) -{ - struct sk_buff *skb = NULL; - - if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) - return; - - skb_reserve(skb, 9); - dn_mk_ack_header(sk, skb, 0x14, 9, 1); - dn_nsp_send(skb); -} - - -void dn_send_conn_ack (struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb = NULL; - struct nsp_conn_ack_msg *msg; - - if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL) - return; - - msg = skb_put(skb, 3); - msg->msgflg = 0x24; - msg->dstaddr = scp->addrrem; - - dn_nsp_send(skb); -} - -static int dn_nsp_retrans_conn_conf(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CC) - dn_send_conn_conf(sk, GFP_ATOMIC); - - return 0; -} - -void dn_send_conn_conf(struct sock *sk, gfp_t gfp) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb = NULL; - struct nsp_conn_init_msg *msg; - __u8 len = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); - - if ((skb = dn_alloc_skb(sk, 50 + len, gfp)) == NULL) - return; - - msg = skb_put(skb, sizeof(*msg)); - msg->msgflg = 0x28; - msg->dstaddr = scp->addrrem; - msg->srcaddr = scp->addrloc; - msg->services = scp->services_loc; - msg->info = scp->info_loc; - msg->segsize = cpu_to_le16(scp->segsize_loc); - - skb_put_u8(skb, len); - - if (len > 0) - skb_put_data(skb, scp->conndata_out.opt_data, len); - - - dn_nsp_send(skb); - - scp->persist = dn_nsp_persist(sk); - scp->persist_fxn = dn_nsp_retrans_conn_conf; -} - - -static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, - unsigned short reason, gfp_t gfp, - struct dst_entry *dst, - int ddl, unsigned char *dd, __le16 rem, __le16 loc) -{ - struct sk_buff *skb = NULL; - int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 1 : 0); - unsigned char *msg; - - if ((dst == NULL) || (rem == 0)) { - net_dbg_ratelimited("DECnet: dn_nsp_do_disc: BUG! 
Please report this to SteveW@ACM.org rem=%u dst=%p\n", - le16_to_cpu(rem), dst); - return; - } - - if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL) - return; - - msg = skb_put(skb, size); - *msg++ = msgflg; - *(__le16 *)msg = rem; - msg += 2; - *(__le16 *)msg = loc; - msg += 2; - *(__le16 *)msg = cpu_to_le16(reason); - msg += 2; - if (msgflg == NSP_DISCINIT) - *msg++ = ddl; - - if (ddl) { - memcpy(msg, dd, ddl); - } - - /* - * This doesn't go via the dn_nsp_send() function since we need - * to be able to send disc packets out which have no socket - * associations. - */ - skb_dst_set(skb, dst_clone(dst)); - dst_output(&init_net, skb->sk, skb); -} - - -void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg, - unsigned short reason, gfp_t gfp) -{ - struct dn_scp *scp = DN_SK(sk); - int ddl = 0; - - if (msgflg == NSP_DISCINIT) - ddl = le16_to_cpu(scp->discdata_out.opt_optl); - - if (reason == 0) - reason = le16_to_cpu(scp->discdata_out.opt_status); - - dn_nsp_do_disc(sk, msgflg, reason, gfp, __sk_dst_get(sk), ddl, - scp->discdata_out.opt_data, scp->addrrem, scp->addrloc); -} - - -void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg, - unsigned short reason) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - int ddl = 0; - gfp_t gfp = GFP_ATOMIC; - - dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb_dst(skb), ddl, - NULL, cb->src_port, cb->dst_port); -} - - -void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval) -{ - struct dn_scp *scp = DN_SK(sk); - struct sk_buff *skb; - unsigned char *ptr; - gfp_t gfp = GFP_ATOMIC; - - if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL) - return; - - skb_reserve(skb, DN_MAX_NSP_DATA_HEADER); - ptr = skb_put(skb, 2); - DN_SKB_CB(skb)->nsp_flags = 0x10; - *ptr++ = lsflags; - *ptr = fcval; - - dn_nsp_queue_xmit(sk, skb, gfp, 1); - - scp->persist = dn_nsp_persist(sk); - scp->persist_fxn = dn_nsp_xmit_timeout; -} - -static int dn_nsp_retrans_conninit(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->state == DN_CI) - dn_nsp_send_conninit(sk, NSP_RCI); - - return 0; -} - -void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) -{ - struct dn_scp *scp = DN_SK(sk); - struct nsp_conn_init_msg *msg; - unsigned char aux; - unsigned char menuver; - struct dn_skb_cb *cb; - unsigned char type = 1; - gfp_t allocation = (msgflg == NSP_CI) ? 
sk->sk_allocation : GFP_ATOMIC; - struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation); - - if (!skb) - return; - - cb = DN_SKB_CB(skb); - msg = skb_put(skb, sizeof(*msg)); - - msg->msgflg = msgflg; - msg->dstaddr = 0x0000; /* Remote Node will assign it*/ - - msg->srcaddr = scp->addrloc; - msg->services = scp->services_loc; /* Requested flow control */ - msg->info = scp->info_loc; /* Version Number */ - msg->segsize = cpu_to_le16(scp->segsize_loc); /* Max segment size */ - - if (scp->peer.sdn_objnum) - type = 0; - - skb_put(skb, dn_sockaddr2username(&scp->peer, - skb_tail_pointer(skb), type)); - skb_put(skb, dn_sockaddr2username(&scp->addr, - skb_tail_pointer(skb), 2)); - - menuver = DN_MENUVER_ACC | DN_MENUVER_USR; - if (scp->peer.sdn_flags & SDF_PROXY) - menuver |= DN_MENUVER_PRX; - if (scp->peer.sdn_flags & SDF_UICPROXY) - menuver |= DN_MENUVER_UIC; - - skb_put_u8(skb, menuver); /* Menu Version */ - - aux = scp->accessdata.acc_userl; - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->accessdata.acc_user, aux); - - aux = scp->accessdata.acc_passl; - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->accessdata.acc_pass, aux); - - aux = scp->accessdata.acc_accl; - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->accessdata.acc_acc, aux); - - aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); - skb_put_u8(skb, aux); - if (aux > 0) - skb_put_data(skb, scp->conndata_out.opt_data, aux); - - scp->persist = dn_nsp_persist(sk); - scp->persist_fxn = dn_nsp_retrans_conninit; - - cb->rt_flags = DN_RT_F_RQR; - - dn_nsp_send(skb); -} diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c deleted file mode 100644 index ac2ee1689111..000000000000 --- a/net/decnet/dn_route.c +++ /dev/null @@ -1,1922 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Routing Functions (Endnode and Router) - * - * Authors: Steve Whitehouse - * Eduardo Marcelo Serrat - * - * Changes: - * Steve Whitehouse : Fixes to allow "intra-ethernet" and - * "return-to-sender" bits on outgoing - * packets. - * Steve Whitehouse : Timeouts for cached routes. - * Steve Whitehouse : Use dst cache for input routes too. - * Steve Whitehouse : Fixed error values in dn_send_skb. - * Steve Whitehouse : Rework routing functions to better fit - * DECnet routing design - * Alexey Kuznetsov : New SMP locking - * Steve Whitehouse : More SMP locking changes & dn_cache_dump() - * Steve Whitehouse : Prerouting NF hook, now really is prerouting. - * Fixed possible skb leak in rtnetlink funcs. - * Steve Whitehouse : Dave Miller's dynamic hash table sizing and - * Alexey Kuznetsov's finer grained locking - * from ipv4/route.c. - * Steve Whitehouse : Routing is now starting to look like a - * sensible set of code now, mainly due to - * my copying the IPv4 routing code. The - * hooks here are modified and will continue - * to evolve for a while. - * Steve Whitehouse : Real SMP at last :-) Also new netfilter - * stuff. Look out raw sockets your days - * are numbered! - * Steve Whitehouse : Added return-to-sender functions. Added - * backlog congestion level return codes. - * Steve Whitehouse : Fixed bug where routes were set up with - * no ref count on net devices. 
- * Steve Whitehouse : RCU for the route cache - * Steve Whitehouse : Preparations for the flow cache - * Steve Whitehouse : Prepare for nonlinear skbs - */ - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct dn_rt_hash_bucket { - struct dn_route __rcu *chain; - spinlock_t lock; -}; - -extern struct neigh_table dn_neigh_table; - - -static unsigned char dn_hiord_addr[6] = {0xAA, 0x00, 0x04, 0x00, 0x00, 0x00}; - -static const int dn_rt_min_delay = 2 * HZ; -static const int dn_rt_max_delay = 10 * HZ; -static const int dn_rt_mtu_expires = 10 * 60 * HZ; - -static unsigned long dn_rt_deadline; - -static int dn_dst_gc(struct dst_ops *ops); -static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); -static unsigned int dn_dst_default_advmss(const struct dst_entry *dst); -static unsigned int dn_dst_mtu(const struct dst_entry *dst); -static void dn_dst_destroy(struct dst_entry *); -static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); -static void dn_dst_link_failure(struct sk_buff *); -static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb , u32 mtu, - bool confirm_neigh); -static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb); -static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr); -static int dn_route_input(struct sk_buff *); -static void dn_run_flush(struct timer_list *unused); - -static struct dn_rt_hash_bucket *dn_rt_hash_table; -static unsigned int dn_rt_hash_mask; - -static struct timer_list dn_route_timer; -static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush); -int decnet_dst_gc_interval = 2; - -static struct dst_ops dn_dst_ops = { - .family = PF_DECnet, - .gc_thresh = 128, - .gc = dn_dst_gc, - .check = dn_dst_check, - .default_advmss = dn_dst_default_advmss, - .mtu = dn_dst_mtu, - .cow_metrics = dst_cow_metrics_generic, - .destroy = dn_dst_destroy, - .ifdown = dn_dst_ifdown, - .negative_advice = dn_dst_negative_advice, - .link_failure = dn_dst_link_failure, - .update_pmtu = dn_dst_update_pmtu, - .redirect = dn_dst_redirect, - .neigh_lookup = dn_dst_neigh_lookup, -}; - -static void dn_dst_destroy(struct dst_entry *dst) -{ - struct dn_route *rt = (struct dn_route *) dst; - - if (rt->n) - neigh_release(rt->n); - dst_destroy_metrics_generic(dst); -} - -static void dn_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) -{ - if (how) { - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *n = rt->n; - - if (n && n->dev == dev) { - n->dev = blackhole_netdev; - dev_hold(n->dev); - dev_put(dev); - } - } -} - -static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) -{ - __u16 tmp = (__u16 __force)(src ^ dst); - tmp ^= (tmp >> 3); - tmp ^= (tmp >> 5); - tmp ^= (tmp >> 10); - return dn_rt_hash_mask & (unsigned int)tmp; -} - -static void dn_dst_check_expire(struct timer_list *unused) -{ - int i; - 
struct dn_route *rt; - struct dn_route __rcu **rtp; - unsigned long now = jiffies; - unsigned long expire = 120 * HZ; - - for (i = 0; i <= dn_rt_hash_mask; i++) { - rtp = &dn_rt_hash_table[i].chain; - - spin_lock(&dn_rt_hash_table[i].lock); - while ((rt = rcu_dereference_protected(*rtp, - lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) > 1 || - (now - rt->dst.lastuse) < expire) { - rtp = &rt->dn_next; - continue; - } - *rtp = rt->dn_next; - rt->dn_next = NULL; - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } - spin_unlock(&dn_rt_hash_table[i].lock); - - if (jiffies != now) - break; - } - - mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ); -} - -static int dn_dst_gc(struct dst_ops *ops) -{ - struct dn_route *rt; - struct dn_route __rcu **rtp; - int i; - unsigned long now = jiffies; - unsigned long expire = 10 * HZ; - - for (i = 0; i <= dn_rt_hash_mask; i++) { - - spin_lock_bh(&dn_rt_hash_table[i].lock); - rtp = &dn_rt_hash_table[i].chain; - - while ((rt = rcu_dereference_protected(*rtp, - lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { - if (atomic_read(&rt->dst.__refcnt) > 1 || - (now - rt->dst.lastuse) < expire) { - rtp = &rt->dn_next; - continue; - } - *rtp = rt->dn_next; - rt->dn_next = NULL; - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - break; - } - spin_unlock_bh(&dn_rt_hash_table[i].lock); - } - - return 0; -} - -/* - * The decnet standards don't impose a particular minimum mtu, what they - * do insist on is that the routing layer accepts a datagram of at least - * 230 bytes long. Here we have to subtract the routing header length from - * 230 to get the minimum acceptable mtu. If there is no neighbour, then we - * assume the worst and use a long header size. - * - * We update both the mtu and the advertised mss (i.e. the segment size we - * advertise to the other end). - */ -static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu, - bool confirm_neigh) -{ - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *n = rt->n; - u32 min_mtu = 230; - struct dn_dev *dn; - - dn = n ? rcu_dereference_raw(n->dev->dn_ptr) : NULL; - - if (dn && dn->use_long == 0) - min_mtu -= 6; - else - min_mtu -= 21; - - if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) { - if (!(dst_metric_locked(dst, RTAX_MTU))) { - dst_metric_set(dst, RTAX_MTU, mtu); - dst_set_expires(dst, dn_rt_mtu_expires); - } - if (!(dst_metric_locked(dst, RTAX_ADVMSS))) { - u32 mss = mtu - DN_MAX_NSP_DATA_HEADER; - u32 existing_mss = dst_metric_raw(dst, RTAX_ADVMSS); - if (!existing_mss || existing_mss > mss) - dst_metric_set(dst, RTAX_ADVMSS, mss); - } - } -} - -static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -/* - * When a route has been marked obsolete. (e.g. 
routing cache flush) - */ -static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie) -{ - return NULL; -} - -static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst) -{ - dst_release(dst); - return NULL; -} - -static void dn_dst_link_failure(struct sk_buff *skb) -{ -} - -static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2) -{ - return ((fl1->daddr ^ fl2->daddr) | - (fl1->saddr ^ fl2->saddr) | - (fl1->flowidn_mark ^ fl2->flowidn_mark) | - (fl1->flowidn_scope ^ fl2->flowidn_scope) | - (fl1->flowidn_oif ^ fl2->flowidn_oif) | - (fl1->flowidn_iif ^ fl2->flowidn_iif)) == 0; -} - -static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_route **rp) -{ - struct dn_route *rth; - struct dn_route __rcu **rthp; - unsigned long now = jiffies; - - rthp = &dn_rt_hash_table[hash].chain; - - spin_lock_bh(&dn_rt_hash_table[hash].lock); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) { - if (compare_keys(&rth->fld, &rt->fld)) { - /* Put it first */ - *rthp = rth->dn_next; - rcu_assign_pointer(rth->dn_next, - dn_rt_hash_table[hash].chain); - rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); - - dst_hold_and_use(&rth->dst, now); - spin_unlock_bh(&dn_rt_hash_table[hash].lock); - - dst_release_immediate(&rt->dst); - *rp = rth; - return 0; - } - rthp = &rth->dn_next; - } - - rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain); - rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); - - dst_hold_and_use(&rt->dst, now); - spin_unlock_bh(&dn_rt_hash_table[hash].lock); - *rp = rt; - return 0; -} - -static void dn_run_flush(struct timer_list *unused) -{ - int i; - struct dn_route *rt, *next; - - for (i = 0; i < dn_rt_hash_mask; i++) { - spin_lock_bh(&dn_rt_hash_table[i].lock); - - rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL); - if (!rt) - goto nothing_to_declare; - - for (; rt; rt = next) { - next = rcu_dereference_raw(rt->dn_next); - RCU_INIT_POINTER(rt->dn_next, NULL); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } - -nothing_to_declare: - spin_unlock_bh(&dn_rt_hash_table[i].lock); - } -} - -static DEFINE_SPINLOCK(dn_rt_flush_lock); - -void dn_rt_cache_flush(int delay) -{ - unsigned long now = jiffies; - int user_mode = !in_interrupt(); - - if (delay < 0) - delay = dn_rt_min_delay; - - spin_lock_bh(&dn_rt_flush_lock); - - if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) { - long tmo = (long)(dn_rt_deadline - now); - - if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay) - tmo = 0; - - if (delay > tmo) - delay = tmo; - } - - if (delay <= 0) { - spin_unlock_bh(&dn_rt_flush_lock); - dn_run_flush(NULL); - return; - } - - if (dn_rt_deadline == 0) - dn_rt_deadline = now + dn_rt_max_delay; - - dn_rt_flush_timer.expires = now + delay; - add_timer(&dn_rt_flush_timer); - spin_unlock_bh(&dn_rt_flush_lock); -} - -/** - * dn_return_short - Return a short packet to its sender - * @skb: The packet to return - * - */ -static int dn_return_short(struct sk_buff *skb) -{ - struct dn_skb_cb *cb; - unsigned char *ptr; - __le16 *src; - __le16 *dst; - - /* Add back headers */ - skb_push(skb, skb->data - skb_network_header(skb)); - - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - cb = DN_SKB_CB(skb); - /* Skip packet length and point to flags */ - ptr = skb->data + 2; - *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; - - dst = (__le16 *)ptr; - ptr += 2; - src = (__le16 *)ptr; - ptr += 2; - *ptr = 0; /* Zero hop count 
*/ - - swap(*src, *dst); - - skb->pkt_type = PACKET_OUTGOING; - dn_rt_finish_output(skb, NULL, NULL); - return NET_RX_SUCCESS; -} - -/** - * dn_return_long - Return a long packet to its sender - * @skb: The long format packet to return - * - */ -static int dn_return_long(struct sk_buff *skb) -{ - struct dn_skb_cb *cb; - unsigned char *ptr; - unsigned char *src_addr, *dst_addr; - unsigned char tmp[ETH_ALEN]; - - /* Add back all headers */ - skb_push(skb, skb->data - skb_network_header(skb)); - - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - cb = DN_SKB_CB(skb); - /* Ignore packet length and point to flags */ - ptr = skb->data + 2; - - /* Skip padding */ - if (*ptr & DN_RT_F_PF) { - char padlen = (*ptr & ~DN_RT_F_PF); - ptr += padlen; - } - - *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; - ptr += 2; - dst_addr = ptr; - ptr += 8; - src_addr = ptr; - ptr += 6; - *ptr = 0; /* Zero hop count */ - - /* Swap source and destination */ - memcpy(tmp, src_addr, ETH_ALEN); - memcpy(src_addr, dst_addr, ETH_ALEN); - memcpy(dst_addr, tmp, ETH_ALEN); - - skb->pkt_type = PACKET_OUTGOING; - dn_rt_finish_output(skb, dst_addr, src_addr); - return NET_RX_SUCCESS; -} - -/** - * dn_route_rx_packet - Try and find a route for an incoming packet - * @net: The applicable net namespace - * @sk: Socket packet transmitted on - * @skb: The packet to find a route for - * - * Returns: result of input function if route is found, error code otherwise - */ -static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dn_skb_cb *cb; - int err; - - err = dn_route_input(skb); - if (err == 0) - return dst_input(skb); - - cb = DN_SKB_CB(skb); - if (decnet_debug_level & 4) { - char *devname = skb->dev ? skb->dev->name : "???"; - - printk(KERN_DEBUG - "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n", - (int)cb->rt_flags, devname, skb->len, - le16_to_cpu(cb->src), le16_to_cpu(cb->dst), - err, skb->pkt_type); - } - - if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) { - switch (cb->rt_flags & DN_RT_PKT_MSK) { - case DN_RT_PKT_SHORT: - return dn_return_short(skb); - case DN_RT_PKT_LONG: - return dn_return_long(skb); - } - } - - kfree_skb(skb); - return NET_RX_DROP; -} - -static int dn_route_rx_long(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned char *ptr = skb->data; - - if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */ - goto drop_it; - - skb_pull(skb, 20); - skb_reset_transport_header(skb); - - /* Destination info */ - ptr += 2; - cb->dst = dn_eth2dn(ptr); - if (memcmp(ptr, dn_hiord_addr, 4) != 0) - goto drop_it; - ptr += 6; - - - /* Source info */ - ptr += 2; - cb->src = dn_eth2dn(ptr); - if (memcmp(ptr, dn_hiord_addr, 4) != 0) - goto drop_it; - ptr += 6; - /* Other junk */ - ptr++; - cb->hops = *ptr++; /* Visit Count */ - - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_rx_packet); - -drop_it: - kfree_skb(skb); - return NET_RX_DROP; -} - - - -static int dn_route_rx_short(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned char *ptr = skb->data; - - if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */ - goto drop_it; - - skb_pull(skb, 5); - skb_reset_transport_header(skb); - - cb->dst = *(__le16 *)ptr; - ptr += 2; - cb->src = *(__le16 *)ptr; - ptr += 2; - cb->hops = *ptr & 0x3f; - - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, - &init_net, 
NULL, skb, skb->dev, NULL, - dn_route_rx_packet); - -drop_it: - kfree_skb(skb); - return NET_RX_DROP; -} - -static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - /* - * I know we drop the packet here, but that's considered success in - * this case - */ - kfree_skb(skb); - return NET_RX_SUCCESS; -} - -static int dn_route_ptp_hello(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - dn_dev_hello(skb); - dn_neigh_pointopoint_hello(skb); - return NET_RX_SUCCESS; -} - -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) -{ - struct dn_skb_cb *cb; - unsigned char flags = 0; - __u16 len = le16_to_cpu(*(__le16 *)skb->data); - struct dn_dev *dn = rcu_dereference(dev->dn_ptr); - unsigned char padlen = 0; - - if (!net_eq(dev_net(dev), &init_net)) - goto dump_it; - - if (dn == NULL) - goto dump_it; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - goto out; - - if (!pskb_may_pull(skb, 3)) - goto dump_it; - - skb_pull(skb, 2); - - if (len > skb->len) - goto dump_it; - - skb_trim(skb, len); - - flags = *skb->data; - - cb = DN_SKB_CB(skb); - cb->stamp = jiffies; - cb->iif = dev->ifindex; - - /* - * If we have padding, remove it. - */ - if (flags & DN_RT_F_PF) { - padlen = flags & ~DN_RT_F_PF; - if (!pskb_may_pull(skb, padlen + 1)) - goto dump_it; - skb_pull(skb, padlen); - flags = *skb->data; - } - - skb_reset_network_header(skb); - - /* - * Weed out future version DECnet - */ - if (flags & DN_RT_F_VER) - goto dump_it; - - cb->rt_flags = flags; - - if (decnet_debug_level & 1) - printk(KERN_DEBUG - "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n", - (int)flags, dev->name, len, skb->len, - padlen); - - if (flags & DN_RT_PKT_CNTL) { - if (unlikely(skb_linearize(skb))) - goto dump_it; - - switch (flags & DN_RT_CNTL_MSK) { - case DN_RT_PKT_INIT: - dn_dev_init_pkt(skb); - break; - case DN_RT_PKT_VERI: - dn_dev_veri_pkt(skb); - break; - } - - if (dn->parms.state != DN_DEV_S_RU) - goto dump_it; - - switch (flags & DN_RT_CNTL_MSK) { - case DN_RT_PKT_HELO: - return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_ptp_hello); - - case DN_RT_PKT_L1RT: - case DN_RT_PKT_L2RT: - return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE, - &init_net, NULL, skb, skb->dev, NULL, - dn_route_discard); - case DN_RT_PKT_ERTH: - return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - &init_net, NULL, skb, skb->dev, NULL, - dn_neigh_router_hello); - - case DN_RT_PKT_EEDH: - return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - &init_net, NULL, skb, skb->dev, NULL, - dn_neigh_endnode_hello); - } - } else { - if (dn->parms.state != DN_DEV_S_RU) - goto dump_it; - - skb_pull(skb, 1); /* Pull flags */ - - switch (flags & DN_RT_PKT_MSK) { - case DN_RT_PKT_LONG: - return dn_route_rx_long(skb); - case DN_RT_PKT_SHORT: - return dn_route_rx_short(skb); - } - } - -dump_it: - kfree_skb(skb); -out: - return NET_RX_DROP; -} - -static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *)dst; - struct net_device *dev = dst->dev; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - int err = -EINVAL; - - if (rt->n == NULL) - goto error; - - skb->dev = dev; - - cb->src = rt->rt_saddr; - cb->dst = rt->rt_daddr; - - /* - * Always set the Intra-Ethernet bit on all outgoing packets - * originated on this node. Only valid flag from upper layers - * is return-to-sender-requested. Set hop count to 0 too. 
- */ - cb->rt_flags &= ~DN_RT_F_RQR; - cb->rt_flags |= DN_RT_F_IE; - cb->hops = 0; - - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, - &init_net, sk, skb, NULL, dev, - dn_to_neigh_output); - -error: - net_dbg_ratelimited("dn_output: This should not happen\n"); - - kfree_skb(skb); - - return err; -} - -static int dn_forward(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct dst_entry *dst = skb_dst(skb); - struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr); - struct dn_route *rt; - int header_len; - struct net_device *dev = skb->dev; - - if (skb->pkt_type != PACKET_HOST) - goto drop; - - /* Ensure that we have enough space for headers */ - rt = (struct dn_route *)skb_dst(skb); - header_len = dn_db->use_long ? 21 : 6; - if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len)) - goto drop; - - /* - * Hop count exceeded. - */ - if (++cb->hops > 30) - goto drop; - - skb->dev = rt->dst.dev; - - /* - * If packet goes out same interface it came in on, then set - * the Intra-Ethernet bit. This has no effect for short - * packets, so we don't need to test for them here. - */ - cb->rt_flags &= ~DN_RT_F_IE; - if (rt->rt_flags & RTCF_DOREDIRECT) - cb->rt_flags |= DN_RT_F_IE; - - return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, - &init_net, NULL, skb, dev, skb->dev, - dn_to_neigh_output); - -drop: - kfree_skb(skb); - return NET_RX_DROP; -} - -/* - * Used to catch bugs. This should never normally get - * called. - */ -static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", - le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); - - kfree_skb(skb); - - return NET_RX_DROP; -} - -static int dn_rt_bug(struct sk_buff *skb) -{ - struct dn_skb_cb *cb = DN_SKB_CB(skb); - - net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", - le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); - - kfree_skb(skb); - - return NET_RX_DROP; -} - -static unsigned int dn_dst_default_advmss(const struct dst_entry *dst) -{ - return dn_mss_from_pmtu(dst->dev, dst_mtu(dst)); -} - -static unsigned int dn_dst_mtu(const struct dst_entry *dst) -{ - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - - return mtu ? 
: dst->dev->mtu; -} - -static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return __neigh_lookup_errno(&dn_neigh_table, daddr, dst->dev); -} - -static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) -{ - struct dn_fib_info *fi = res->fi; - struct net_device *dev = rt->dst.dev; - unsigned int mss_metric; - struct neighbour *n; - - if (fi) { - if (DN_FIB_RES_GW(*res) && - DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = DN_FIB_RES_GW(*res); - dst_init_metrics(&rt->dst, fi->fib_metrics, true); - } - rt->rt_type = res->type; - - if (dev != NULL && rt->n == NULL) { - n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); - if (IS_ERR(n)) - return PTR_ERR(n); - rt->n = n; - } - - if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) - dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu); - mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS); - if (mss_metric) { - unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); - if (mss_metric > mss) - dst_metric_set(&rt->dst, RTAX_ADVMSS, mss); - } - return 0; -} - -static inline int dn_match_addr(__le16 addr1, __le16 addr2) -{ - __u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2); - int match = 16; - while (tmp) { - tmp >>= 1; - match--; - } - return match; -} - -static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope) -{ - __le16 saddr = 0; - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int best_match = 0; - int ret; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa != NULL; - ifa = rcu_dereference(ifa->ifa_next)) { - if (ifa->ifa_scope > scope) - continue; - if (!daddr) { - saddr = ifa->ifa_local; - break; - } - ret = dn_match_addr(daddr, ifa->ifa_local); - if (ret > best_match) - saddr = ifa->ifa_local; - if (best_match == 0) - saddr = ifa->ifa_local; - } - rcu_read_unlock(); - - return saddr; -} - -static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res) -{ - return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope); -} - -static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res) -{ - __le16 mask = dnet_make_mask(res->prefixlen); - return (daddr&~mask)|res->fi->fib_nh->nh_gw; -} - -static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard) -{ - struct flowidn fld = { - .daddr = oldflp->daddr, - .saddr = oldflp->saddr, - .flowidn_scope = RT_SCOPE_UNIVERSE, - .flowidn_mark = oldflp->flowidn_mark, - .flowidn_iif = LOOPBACK_IFINDEX, - .flowidn_oif = oldflp->flowidn_oif, - }; - struct dn_route *rt = NULL; - struct net_device *dev_out = NULL, *dev; - struct neighbour *neigh = NULL; - unsigned int hash; - unsigned int flags = 0; - struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST }; - int err; - int free_res = 0; - __le16 gateway = 0; - - if (decnet_debug_level & 16) - printk(KERN_DEBUG - "dn_route_output_slow: dst=%04x src=%04x mark=%d" - " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr), - le16_to_cpu(oldflp->saddr), - oldflp->flowidn_mark, LOOPBACK_IFINDEX, - oldflp->flowidn_oif); - - /* If we have an output interface, verify it's a DECnet device */ - if (oldflp->flowidn_oif) { - dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif); - err = -ENODEV; - if (dev_out && dev_out->dn_ptr == NULL) { - dev_put(dev_out); - dev_out = NULL; - } - if (dev_out == NULL) - goto out; - } - - /* If we have a source address, verify that it's 
a local address */ - if (oldflp->saddr) { - err = -EADDRNOTAVAIL; - - if (dev_out) { - if (dn_dev_islocal(dev_out, oldflp->saddr)) - goto source_ok; - dev_put(dev_out); - goto out; - } - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (!dev->dn_ptr) - continue; - if (!dn_dev_islocal(dev, oldflp->saddr)) - continue; - if ((dev->flags & IFF_LOOPBACK) && - oldflp->daddr && - !dn_dev_islocal(dev, oldflp->daddr)) - continue; - - dev_out = dev; - break; - } - rcu_read_unlock(); - if (dev_out == NULL) - goto out; - dev_hold(dev_out); -source_ok: - ; - } - - /* No destination? Assume it's local */ - if (!fld.daddr) { - fld.daddr = fld.saddr; - - dev_put(dev_out); - err = -EINVAL; - dev_out = init_net.loopback_dev; - if (!dev_out->dn_ptr) - goto out; - err = -EADDRNOTAVAIL; - dev_hold(dev_out); - if (!fld.daddr) { - fld.daddr = - fld.saddr = dnet_select_source(dev_out, 0, - RT_SCOPE_HOST); - if (!fld.daddr) - goto done; - } - fld.flowidn_oif = LOOPBACK_IFINDEX; - res.type = RTN_LOCAL; - goto make_route; - } - - if (decnet_debug_level & 16) - printk(KERN_DEBUG - "dn_route_output_slow: initial checks complete." - " dst=%04x src=%04x oif=%d try_hard=%d\n", - le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr), - fld.flowidn_oif, try_hard); - - /* - * N.B. If the kernel is compiled without router support then - * dn_fib_lookup() will evaluate to non-zero so this if () block - * will always be executed. - */ - err = -ESRCH; - if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) { - struct dn_dev *dn_db; - if (err != -ESRCH) - goto out; - /* - * Here the fallback is basically the standard algorithm for - * routing in endnodes which is described in the DECnet routing - * docs - * - * If we are not trying hard, look in neighbour cache. - * The result is tested to ensure that if a specific output - * device/source address was requested, then we honour that - * here - */ - if (!try_hard) { - neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr); - if (neigh) { - if ((oldflp->flowidn_oif && - (neigh->dev->ifindex != oldflp->flowidn_oif)) || - (oldflp->saddr && - (!dn_dev_islocal(neigh->dev, - oldflp->saddr)))) { - neigh_release(neigh); - neigh = NULL; - } else { - dev_put(dev_out); - if (dn_dev_islocal(neigh->dev, fld.daddr)) { - dev_out = init_net.loopback_dev; - res.type = RTN_LOCAL; - } else { - dev_out = neigh->dev; - } - dev_hold(dev_out); - goto select_source; - } - } - } - - /* Not there? Perhaps it's a local address */ - if (dev_out == NULL) - dev_out = dn_dev_get_default(); - err = -ENODEV; - if (dev_out == NULL) - goto out; - dn_db = rcu_dereference_raw(dev_out->dn_ptr); - if (!dn_db) - goto e_inval; - /* Possible improvement - check all devices for local addr */ - if (dn_dev_islocal(dev_out, fld.daddr)) { - dev_put(dev_out); - dev_out = init_net.loopback_dev; - dev_hold(dev_out); - res.type = RTN_LOCAL; - goto select_source; - } - /* Not local either.... try sending it to the default router */ - neigh = neigh_clone(dn_db->router); - BUG_ON(neigh && neigh->dev != dev_out); - - /* Ok then, we assume it's directly connected and move on */ -select_source: - if (neigh) - gateway = container_of(neigh, struct dn_neigh, n)->addr; - if (gateway == 0) - gateway = fld.daddr; - if (fld.saddr == 0) { - fld.saddr = dnet_select_source(dev_out, gateway, - res.type == RTN_LOCAL ? 
- RT_SCOPE_HOST : - RT_SCOPE_LINK); - if (fld.saddr == 0 && res.type != RTN_LOCAL) - goto e_addr; - } - fld.flowidn_oif = dev_out->ifindex; - goto make_route; - } - free_res = 1; - - if (res.type == RTN_NAT) - goto e_inval; - - if (res.type == RTN_LOCAL) { - if (!fld.saddr) - fld.saddr = fld.daddr; - dev_put(dev_out); - dev_out = init_net.loopback_dev; - dev_hold(dev_out); - if (!dev_out->dn_ptr) - goto e_inval; - fld.flowidn_oif = dev_out->ifindex; - if (res.fi) - dn_fib_info_put(res.fi); - res.fi = NULL; - goto make_route; - } - - if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0) - dn_fib_select_multipath(&fld, &res); - - /* - * We could add some logic to deal with default routes here and - * get rid of some of the special casing above. - */ - - if (!fld.saddr) - fld.saddr = DN_FIB_RES_PREFSRC(res); - - dev_put(dev_out); - dev_out = DN_FIB_RES_DEV(res); - dev_hold(dev_out); - fld.flowidn_oif = dev_out->ifindex; - gateway = DN_FIB_RES_GW(res); - -make_route: - if (dev_out->flags & IFF_LOOPBACK) - flags |= RTCF_LOCAL; - - rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, 0); - if (rt == NULL) - goto e_nobufs; - - rt->dn_next = NULL; - memset(&rt->fld, 0, sizeof(rt->fld)); - rt->fld.saddr = oldflp->saddr; - rt->fld.daddr = oldflp->daddr; - rt->fld.flowidn_oif = oldflp->flowidn_oif; - rt->fld.flowidn_iif = 0; - rt->fld.flowidn_mark = oldflp->flowidn_mark; - - rt->rt_saddr = fld.saddr; - rt->rt_daddr = fld.daddr; - rt->rt_gateway = gateway ? gateway : fld.daddr; - rt->rt_local_src = fld.saddr; - - rt->rt_dst_map = fld.daddr; - rt->rt_src_map = fld.saddr; - - rt->n = neigh; - neigh = NULL; - - rt->dst.lastuse = jiffies; - rt->dst.output = dn_output; - rt->dst.input = dn_rt_bug; - rt->rt_flags = flags; - if (flags & RTCF_LOCAL) - rt->dst.input = dn_nsp_rx; - - err = dn_rt_set_next_hop(rt, &res); - if (err) - goto e_neighbour; - - hash = dn_hash(rt->fld.saddr, rt->fld.daddr); - /* dn_insert_route() increments dst->__refcnt */ - dn_insert_route(rt, hash, (struct dn_route **)pprt); - -done: - if (neigh) - neigh_release(neigh); - if (free_res) - dn_fib_res_put(&res); - dev_put(dev_out); -out: - return err; - -e_addr: - err = -EADDRNOTAVAIL; - goto done; -e_inval: - err = -EINVAL; - goto done; -e_nobufs: - err = -ENOBUFS; - goto done; -e_neighbour: - dst_release_immediate(&rt->dst); - goto e_nobufs; -} - - -/* - * N.B. The flags may be moved into the flowi at some future stage. 
- */ -static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags) -{ - unsigned int hash = dn_hash(flp->saddr, flp->daddr); - struct dn_route *rt = NULL; - - if (!(flags & MSG_TRYHARD)) { - rcu_read_lock_bh(); - for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt; - rt = rcu_dereference_bh(rt->dn_next)) { - if ((flp->daddr == rt->fld.daddr) && - (flp->saddr == rt->fld.saddr) && - (flp->flowidn_mark == rt->fld.flowidn_mark) && - dn_is_output_route(rt) && - (rt->fld.flowidn_oif == flp->flowidn_oif)) { - dst_hold_and_use(&rt->dst, jiffies); - rcu_read_unlock_bh(); - *pprt = &rt->dst; - return 0; - } - } - rcu_read_unlock_bh(); - } - - return dn_route_output_slow(pprt, flp, flags); -} - -static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags) -{ - int err; - - err = __dn_route_output_key(pprt, flp, flags); - if (err == 0 && flp->flowidn_proto) { - *pprt = xfrm_lookup(&init_net, *pprt, - flowidn_to_flowi(flp), NULL, 0); - if (IS_ERR(*pprt)) { - err = PTR_ERR(*pprt); - *pprt = NULL; - } - } - return err; -} - -int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, struct sock *sk, int flags) -{ - int err; - - err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); - if (err == 0 && fl->flowidn_proto) { - *pprt = xfrm_lookup(&init_net, *pprt, - flowidn_to_flowi(fl), sk, 0); - if (IS_ERR(*pprt)) { - err = PTR_ERR(*pprt); - *pprt = NULL; - } - } - return err; -} - -static int dn_route_input_slow(struct sk_buff *skb) -{ - struct dn_route *rt = NULL; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - struct net_device *in_dev = skb->dev; - struct net_device *out_dev = NULL; - struct dn_dev *dn_db; - struct neighbour *neigh = NULL; - unsigned int hash; - int flags = 0; - __le16 gateway = 0; - __le16 local_src = 0; - struct flowidn fld = { - .daddr = cb->dst, - .saddr = cb->src, - .flowidn_scope = RT_SCOPE_UNIVERSE, - .flowidn_mark = skb->mark, - .flowidn_iif = skb->dev->ifindex, - }; - struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE }; - int err = -EINVAL; - int free_res = 0; - - dev_hold(in_dev); - - dn_db = rcu_dereference(in_dev->dn_ptr); - if (!dn_db) - goto out; - - /* Zero source addresses are not allowed */ - if (fld.saddr == 0) - goto out; - - /* - * In this case we've just received a packet from a source - * outside ourselves pretending to come from us. We don't - * allow it any further to prevent routing loops, spoofing and - * other nasties. Loopback packets already have the dst attached - * so this only affects packets which have originated elsewhere. - */ - err = -ENOTUNIQ; - if (dn_dev_islocal(in_dev, cb->src)) - goto out; - - err = dn_fib_lookup(&fld, &res); - if (err) { - if (err != -ESRCH) - goto out; - /* - * Is the destination us ? 
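For reference, dn_route_output_sock() above first tries the cached lookup in __dn_route_output_key() and only falls back to dn_route_output_slow() on a miss. A minimal sketch of how a caller might have used it; the socket handling and field values are assumptions for illustration, not code from this series:

	/* Illustrative caller of the output-route API above. */
	static int example_get_route(struct sock *sk, __le16 remote)
	{
		struct dst_entry *dst = NULL;
		struct flowidn fld = {
			.daddr = remote,	/* destination node */
			.saddr = 0,		/* let dnet_select_source() choose */
			.flowidn_proto = DNPROTO_NSP,
		};
		int err;

		/* MSG_TRYHARD in flags would skip the neighbour-cache shortcut */
		err = dn_route_output_sock((struct dst_entry __rcu **)&dst,
					   &fld, sk, 0);
		if (err)
			return err;

		sk_dst_set(sk, dst);	/* cache the route on the socket */
		return 0;
	}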
- */ - if (!dn_dev_islocal(in_dev, cb->dst)) - goto e_inval; - - res.type = RTN_LOCAL; - } else { - __le16 src_map = fld.saddr; - free_res = 1; - - out_dev = DN_FIB_RES_DEV(res); - if (out_dev == NULL) { - net_crit_ratelimited("Bug in dn_route_input_slow() No output device\n"); - goto e_inval; - } - dev_hold(out_dev); - - if (res.r) - src_map = fld.saddr; /* no NAT support for now */ - - gateway = DN_FIB_RES_GW(res); - if (res.type == RTN_NAT) { - fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res); - dn_fib_res_put(&res); - free_res = 0; - if (dn_fib_lookup(&fld, &res)) - goto e_inval; - free_res = 1; - if (res.type != RTN_UNICAST) - goto e_inval; - flags |= RTCF_DNAT; - gateway = fld.daddr; - } - fld.saddr = src_map; - } - - switch (res.type) { - case RTN_UNICAST: - /* - * Forwarding check here, we only check for forwarding - * being turned off; if you want to only forward intra - * area, it's up to you to set the routing tables up - * correctly. - */ - if (dn_db->parms.forwarding == 0) - goto e_inval; - - if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0) - dn_fib_select_multipath(&fld, &res); - - /* - * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT - * flag as a hint to set the intra-ethernet bit when - * forwarding. If we've got NAT in operation, we don't do - * this optimisation. - */ - if (out_dev == in_dev && !(flags & RTCF_NAT)) - flags |= RTCF_DOREDIRECT; - - local_src = DN_FIB_RES_PREFSRC(res); - break; - case RTN_BLACKHOLE: - case RTN_UNREACHABLE: - break; - case RTN_LOCAL: - flags |= RTCF_LOCAL; - fld.saddr = cb->dst; - fld.daddr = cb->src; - - /* Routing tables gave us a gateway */ - if (gateway) - goto make_route; - - /* Packet was intra-ethernet, so we know it's on-link */ - if (cb->rt_flags & DN_RT_F_IE) { - gateway = cb->src; - goto make_route; - } - - /* Use the default router if there is one */ - neigh = neigh_clone(dn_db->router); - if (neigh) { - gateway = container_of(neigh, struct dn_neigh, n)->addr; - goto make_route; - } - - /* Close eyes and pray */ - gateway = cb->src; - goto make_route; - default: - goto e_inval; - } - -make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, 0); - if (rt == NULL) - goto e_nobufs; - - rt->dn_next = NULL; - memset(&rt->fld, 0, sizeof(rt->fld)); - rt->rt_saddr = fld.saddr; - rt->rt_daddr = fld.daddr; - rt->rt_gateway = fld.daddr; - if (gateway) - rt->rt_gateway = gateway; - rt->rt_local_src = local_src ? 
local_src : rt->rt_saddr; - - rt->rt_dst_map = fld.daddr; - rt->rt_src_map = fld.saddr; - - rt->fld.saddr = cb->src; - rt->fld.daddr = cb->dst; - rt->fld.flowidn_oif = 0; - rt->fld.flowidn_iif = in_dev->ifindex; - rt->fld.flowidn_mark = fld.flowidn_mark; - - rt->n = neigh; - rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug_out; - switch (res.type) { - case RTN_UNICAST: - rt->dst.input = dn_forward; - break; - case RTN_LOCAL: - rt->dst.output = dn_output; - rt->dst.input = dn_nsp_rx; - rt->dst.dev = in_dev; - flags |= RTCF_LOCAL; - break; - default: - case RTN_UNREACHABLE: - case RTN_BLACKHOLE: - rt->dst.input = dst_discard; - } - rt->rt_flags = flags; - - err = dn_rt_set_next_hop(rt, &res); - if (err) - goto e_neighbour; - - hash = dn_hash(rt->fld.saddr, rt->fld.daddr); - /* dn_insert_route() increments dst->__refcnt */ - dn_insert_route(rt, hash, &rt); - skb_dst_set(skb, &rt->dst); - -done: - if (neigh) - neigh_release(neigh); - if (free_res) - dn_fib_res_put(&res); - dev_put(in_dev); - dev_put(out_dev); -out: - return err; - -e_inval: - err = -EINVAL; - goto done; - -e_nobufs: - err = -ENOBUFS; - goto done; - -e_neighbour: - dst_release_immediate(&rt->dst); - goto done; -} - -static int dn_route_input(struct sk_buff *skb) -{ - struct dn_route *rt; - struct dn_skb_cb *cb = DN_SKB_CB(skb); - unsigned int hash = dn_hash(cb->src, cb->dst); - - if (skb_dst(skb)) - return 0; - - rcu_read_lock(); - for (rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; - rt = rcu_dereference(rt->dn_next)) { - if ((rt->fld.saddr == cb->src) && - (rt->fld.daddr == cb->dst) && - (rt->fld.flowidn_oif == 0) && - (rt->fld.flowidn_mark == skb->mark) && - (rt->fld.flowidn_iif == cb->iif)) { - dst_hold_and_use(&rt->dst, jiffies); - rcu_read_unlock(); - skb_dst_set(skb, (struct dst_entry *)rt); - return 0; - } - } - rcu_read_unlock(); - - return dn_route_input_slow(skb); -} - -static int dn_rt_fill_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, int nowait, unsigned int flags) -{ - struct dn_route *rt = (struct dn_route *)skb_dst(skb); - struct rtmsg *r; - struct nlmsghdr *nlh; - long expires; - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - r->rtm_family = AF_DECnet; - r->rtm_dst_len = 16; - r->rtm_src_len = 0; - r->rtm_tos = 0; - r->rtm_table = RT_TABLE_MAIN; - r->rtm_type = rt->rt_type; - r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; - r->rtm_scope = RT_SCOPE_UNIVERSE; - r->rtm_protocol = RTPROT_UNSPEC; - - if (rt->rt_flags & RTCF_NOTIFY) - r->rtm_flags |= RTM_F_NOTIFY; - - if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN) < 0 || - nla_put_le16(skb, RTA_DST, rt->rt_daddr) < 0) - goto errout; - - if (rt->fld.saddr) { - r->rtm_src_len = 16; - if (nla_put_le16(skb, RTA_SRC, rt->fld.saddr) < 0) - goto errout; - } - if (rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex) < 0) - goto errout; - - /* - * Note to self - change this if input routes reverse direction when - * they deal only with inputs and not with replies like they do - * currently. - */ - if (nla_put_le16(skb, RTA_PREFSRC, rt->rt_local_src) < 0) - goto errout; - - if (rt->rt_daddr != rt->rt_gateway && - nla_put_le16(skb, RTA_GATEWAY, rt->rt_gateway) < 0) - goto errout; - - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) - goto errout; - - expires = rt->dst.expires ? 
rt->dst.expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, - rt->dst.error) < 0) - goto errout; - - if (dn_is_input_route(rt) && - nla_put_u32(skb, RTA_IIF, rt->fld.flowidn_iif) < 0) - goto errout; - - nlmsg_end(skb, nlh); - return 0; - -errout: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = { - [RTA_DST] = { .type = NLA_U16 }, - [RTA_SRC] = { .type = NLA_U16 }, - [RTA_IIF] = { .type = NLA_U32 }, - [RTA_OIF] = { .type = NLA_U32 }, - [RTA_GATEWAY] = { .type = NLA_U16 }, - [RTA_PRIORITY] = { .type = NLA_U32 }, - [RTA_PREFSRC] = { .type = NLA_U16 }, - [RTA_METRICS] = { .type = NLA_NESTED }, - [RTA_MULTIPATH] = { .type = NLA_NESTED }, - [RTA_TABLE] = { .type = NLA_U32 }, - [RTA_MARK] = { .type = NLA_U32 }, -}; - -/* - * This is called by both endnodes and routers now. - */ -static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(in_skb->sk); - struct rtmsg *rtm = nlmsg_data(nlh); - struct dn_route *rt = NULL; - struct dn_skb_cb *cb; - int err; - struct sk_buff *skb; - struct flowidn fld; - struct nlattr *tb[RTA_MAX+1]; - - if (!net_eq(net, &init_net)) - return -EINVAL; - - err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_dn_policy, extack); - if (err < 0) - return err; - - memset(&fld, 0, sizeof(fld)); - fld.flowidn_proto = DNPROTO_NSP; - - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb == NULL) - return -ENOBUFS; - skb_reset_mac_header(skb); - cb = DN_SKB_CB(skb); - - if (tb[RTA_SRC]) - fld.saddr = nla_get_le16(tb[RTA_SRC]); - - if (tb[RTA_DST]) - fld.daddr = nla_get_le16(tb[RTA_DST]); - - if (tb[RTA_IIF]) - fld.flowidn_iif = nla_get_u32(tb[RTA_IIF]); - - if (fld.flowidn_iif) { - struct net_device *dev; - dev = __dev_get_by_index(&init_net, fld.flowidn_iif); - if (!dev || !dev->dn_ptr) { - kfree_skb(skb); - return -ENODEV; - } - skb->protocol = htons(ETH_P_DNA_RT); - skb->dev = dev; - cb->src = fld.saddr; - cb->dst = fld.daddr; - local_bh_disable(); - err = dn_route_input(skb); - local_bh_enable(); - memset(cb, 0, sizeof(struct dn_skb_cb)); - rt = (struct dn_route *)skb_dst(skb); - if (!err && -rt->dst.error) - err = rt->dst.error; - } else { - if (tb[RTA_OIF]) - fld.flowidn_oif = nla_get_u32(tb[RTA_OIF]); - - err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0); - } - - skb->dev = NULL; - if (err) - goto out_free; - skb_dst_set(skb, &rt->dst); - if (rtm->rtm_flags & RTM_F_NOTIFY) - rt->rt_flags |= RTCF_NOTIFY; - - err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); - if (err < 0) { - err = -EMSGSIZE; - goto out_free; - } - - return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).portid); - -out_free: - kfree_skb(skb); - return err; -} - -/* - * For routers, this is called from dn_fib_dump, but for endnodes it's - * called directly from the rtnetlink dispatch table. 
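dn_cache_dump() below walks the route-cache hash with a two-level resume cursor kept in the netlink callback state, so a dump interrupted by a full skb restarts where it stopped. A minimal sketch of that pattern under assumed names (example_tbl, example_mask and fill_one() are placeholders, not DECnet symbols):

	/* args[0] = bucket to resume at, args[1] = index within that bucket. */
	struct example_ent { struct example_ent *next; };
	extern struct example_ent *example_tbl[];		/* assumed table */
	extern unsigned long example_mask;			/* assumed nbuckets - 1 */
	extern int fill_one(struct sk_buff *skb, struct netlink_callback *cb,
			    struct example_ent *e);		/* assumed helper */

	static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
	{
		unsigned long h = cb->args[0];
		long idx = 0, s_idx = cb->args[1];
		struct example_ent *e;

		for (; h <= example_mask; h++, s_idx = 0) {
			idx = 0;
			for (e = example_tbl[h]; e; e = e->next, idx++) {
				if (idx < s_idx)
					continue;
				/* fill_one() returns < 0 once the skb is full */
				if (fill_one(skb, cb, e) < 0)
					goto done;
			}
		}
	done:
		cb->args[0] = h;
		cb->args[1] = idx;
		return skb->len;	/* 0 ends the dump, > 0 means call again */
	}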
- */ -int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct dn_route *rt; - int h, s_h; - int idx, s_idx; - struct rtmsg *rtm; - - if (!net_eq(net, &init_net)) - return 0; - - if (nlmsg_len(cb->nlh) < sizeof(struct rtmsg)) - return -EINVAL; - - rtm = nlmsg_data(cb->nlh); - if (!(rtm->rtm_flags & RTM_F_CLONED)) - return 0; - - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - for (h = 0; h <= dn_rt_hash_mask; h++) { - if (h < s_h) - continue; - if (h > s_h) - s_idx = 0; - rcu_read_lock_bh(); - for (rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; - rt; - rt = rcu_dereference_bh(rt->dn_next), idx++) { - if (idx < s_idx) - continue; - skb_dst_set(skb, dst_clone(&rt->dst)); - if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) < 0) { - skb_dst_drop(skb); - rcu_read_unlock_bh(); - goto done; - } - skb_dst_drop(skb); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; - return skb->len; -} - -#ifdef CONFIG_PROC_FS -struct dn_rt_cache_iter_state { - int bucket; -}; - -static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq) -{ - struct dn_route *rt = NULL; - struct dn_rt_cache_iter_state *s = seq->private; - - for (s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) { - rcu_read_lock_bh(); - rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); - if (rt) - break; - rcu_read_unlock_bh(); - } - return rt; -} - -static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt) -{ - struct dn_rt_cache_iter_state *s = seq->private; - - rt = rcu_dereference_bh(rt->dn_next); - while (!rt) { - rcu_read_unlock_bh(); - if (--s->bucket < 0) - break; - rcu_read_lock_bh(); - rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); - } - return rt; -} - -static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct dn_route *rt = dn_rt_cache_get_first(seq); - - if (rt) { - while (*pos && (rt = dn_rt_cache_get_next(seq, rt))) - --*pos; - } - return *pos ? NULL : rt; -} - -static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct dn_route *rt = dn_rt_cache_get_next(seq, v); - ++*pos; - return rt; -} - -static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v) -{ - if (v) - rcu_read_unlock_bh(); -} - -static int dn_rt_cache_seq_show(struct seq_file *seq, void *v) -{ - struct dn_route *rt = v; - char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; - - seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", - rt->dst.dev ? 
rt->dst.dev->name : "*", - dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), - dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), - atomic_read(&rt->dst.__refcnt), - rt->dst.__use, 0); - return 0; -} - -static const struct seq_operations dn_rt_cache_seq_ops = { - .start = dn_rt_cache_seq_start, - .next = dn_rt_cache_seq_next, - .stop = dn_rt_cache_seq_stop, - .show = dn_rt_cache_seq_show, -}; -#endif /* CONFIG_PROC_FS */ - -void __init dn_route_init(void) -{ - int i, goal, order; - - dn_dst_ops.kmem_cachep = - kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - dst_entries_init(&dn_dst_ops); - timer_setup(&dn_route_timer, dn_dst_check_expire, 0); - dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; - add_timer(&dn_route_timer); - - goal = totalram_pages() >> (26 - PAGE_SHIFT); - - for (order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - /* - * Only want 1024 entries max, since the table is very, very unlikely - * to be larger than that. - */ - while (order && ((((1UL << order) * PAGE_SIZE) / - sizeof(struct dn_rt_hash_bucket)) >= 2048)) - order--; - - do { - dn_rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct dn_rt_hash_bucket); - while (dn_rt_hash_mask & (dn_rt_hash_mask - 1)) - dn_rt_hash_mask--; - dn_rt_hash_table = (struct dn_rt_hash_bucket *) - __get_free_pages(GFP_ATOMIC, order); - } while (dn_rt_hash_table == NULL && --order > 0); - - if (!dn_rt_hash_table) - panic("Failed to allocate DECnet route cache hash table\n"); - - printk(KERN_INFO - "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n", - dn_rt_hash_mask, - (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024); - - dn_rt_hash_mask--; - for (i = 0; i <= dn_rt_hash_mask; i++) { - spin_lock_init(&dn_rt_hash_table[i].lock); - dn_rt_hash_table[i].chain = NULL; - } - - dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); - - proc_create_seq_private("decnet_cache", 0444, init_net.proc_net, - &dn_rt_cache_seq_ops, - sizeof(struct dn_rt_cache_iter_state), NULL); - -#ifdef CONFIG_DECNET_ROUTER - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, - dn_cache_getroute, dn_fib_dump, 0); -#else - rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE, - dn_cache_getroute, dn_cache_dump, 0); -#endif -} - -void __exit dn_route_cleanup(void) -{ - del_timer(&dn_route_timer); - dn_run_flush(NULL); - - remove_proc_entry("decnet_cache", init_net.proc_net); - dst_entries_destroy(&dn_dst_ops); -} diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c deleted file mode 100644 index ee73057529cf..000000000000 --- a/net/decnet/dn_rules.c +++ /dev/null @@ -1,253 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. 
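To make the hash-table sizing in dn_route_init() above concrete, a worked example; the numbers assume 4 KiB pages (PAGE_SHIFT == 12), 1 GiB of RAM and a 16-byte struct dn_rt_hash_bucket, all of which are assumptions of the sketch:

	goal  = 262144 >> (26 - 12) = 16 pages	(one page per 64 MiB of RAM)
	order = 4				(smallest with 1UL << order >= 16)
	trim:	16 * 4096 / 16 = 4096 >= 2048	-> order = 3
		 8 * 4096 / 16 = 2048 >= 2048	-> order = 2
		 4 * 4096 / 16 = 1024 <  2048	-> stop
	mask  = 4 * 4096 / 16 = 1024		(already a power of two)

After the final decrement, dn_rt_hash_mask is 1023, i.e. the 1024-bucket cap that the "Only want 1024 entries max" comment above refers to.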
- * - * DECnet Routing Forwarding Information Base (Rules) - * - * Author: Steve Whitehouse - * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c - * - * - * Changes: - * Steve Whitehouse - * Updated for Thomas Graf's generic rules - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct fib_rules_ops *dn_fib_rules_ops; - -struct dn_fib_rule -{ - struct fib_rule common; - unsigned char dst_len; - unsigned char src_len; - __le16 src; - __le16 srcmask; - __le16 dst; - __le16 dstmask; - __le16 srcmap; - u8 flags; -}; - - -int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res) -{ - struct fib_lookup_arg arg = { - .result = res, - }; - int err; - - err = fib_rules_lookup(dn_fib_rules_ops, - flowidn_to_flowi(flp), 0, &arg); - res->r = arg.rule; - - return err; -} - -static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) -{ - struct flowidn *fld = &flp->u.dn; - int err = -EAGAIN; - struct dn_fib_table *tbl; - - switch(rule->action) { - case FR_ACT_TO_TBL: - break; - - case FR_ACT_UNREACHABLE: - err = -ENETUNREACH; - goto errout; - - case FR_ACT_PROHIBIT: - err = -EACCES; - goto errout; - - case FR_ACT_BLACKHOLE: - default: - err = -EINVAL; - goto errout; - } - - tbl = dn_fib_get_table(rule->table, 0); - if (tbl == NULL) - goto errout; - - err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result); - if (err > 0) - err = -EAGAIN; -errout: - return err; -} - -static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) -{ - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - struct flowidn *fld = &fl->u.dn; - __le16 daddr = fld->daddr; - __le16 saddr = fld->saddr; - - if (((saddr ^ r->src) & r->srcmask) || - ((daddr ^ r->dst) & r->dstmask)) - return 0; - - return 1; -} - -static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - int err = -EINVAL; - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - - if (frh->tos) { - NL_SET_ERR_MSG(extack, "Invalid tos value"); - goto errout; - } - - if (rule->table == RT_TABLE_UNSPEC) { - if (rule->action == FR_ACT_TO_TBL) { - struct dn_fib_table *table; - - table = dn_fib_empty_table(); - if (table == NULL) { - err = -ENOBUFS; - goto errout; - } - - rule->table = table->n; - } - } - - if (frh->src_len) - r->src = nla_get_le16(tb[FRA_SRC]); - - if (frh->dst_len) - r->dst = nla_get_le16(tb[FRA_DST]); - - r->src_len = frh->src_len; - r->srcmask = dnet_make_mask(r->src_len); - r->dst_len = frh->dst_len; - r->dstmask = dnet_make_mask(r->dst_len); - err = 0; -errout: - return err; -} - -static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, - struct nlattr **tb) -{ - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - - if (frh->src_len && (r->src_len != frh->src_len)) - return 0; - - if (frh->dst_len && (r->dst_len != frh->dst_len)) - return 0; - - if (frh->src_len && (r->src != nla_get_le16(tb[FRA_SRC]))) - return 0; - - if (frh->dst_len && (r->dst != nla_get_le16(tb[FRA_DST]))) - return 0; - - return 1; -} - -unsigned int dnet_addr_type(__le16 addr) -{ - struct flowidn fld = { .daddr = addr }; - struct dn_fib_res res; - unsigned int ret = RTN_UNICAST; - struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0); - - res.r = NULL; - - if (tb) { - if (!tb->lookup(tb, &fld, &res)) { - ret 
= res.type; - dn_fib_res_put(&res); - } - } - return ret; -} - -static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh) -{ - struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - - frh->dst_len = r->dst_len; - frh->src_len = r->src_len; - frh->tos = 0; - - if ((r->dst_len && - nla_put_le16(skb, FRA_DST, r->dst)) || - (r->src_len && - nla_put_le16(skb, FRA_SRC, r->src))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOBUFS; -} - -static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops) -{ - dn_rt_cache_flush(-1); -} - -static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { - .family = AF_DECnet, - .rule_size = sizeof(struct dn_fib_rule), - .addr_size = sizeof(u16), - .action = dn_fib_rule_action, - .match = dn_fib_rule_match, - .configure = dn_fib_rule_configure, - .compare = dn_fib_rule_compare, - .fill = dn_fib_rule_fill, - .flush_cache = dn_fib_rule_flush_cache, - .nlgroup = RTNLGRP_DECnet_RULE, - .owner = THIS_MODULE, - .fro_net = &init_net, -}; - -void __init dn_fib_rules_init(void) -{ - dn_fib_rules_ops = - fib_rules_register(&dn_fib_rules_ops_template, &init_net); - BUG_ON(IS_ERR(dn_fib_rules_ops)); - BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff, - RT_TABLE_MAIN, 0)); -} - -void __exit dn_fib_rules_cleanup(void) -{ - rtnl_lock(); - fib_rules_unregister(dn_fib_rules_ops); - rtnl_unlock(); - rcu_barrier(); -} diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c deleted file mode 100644 index 4086f9c746af..000000000000 --- a/net/decnet/dn_table.c +++ /dev/null @@ -1,929 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. 
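The masked XOR in dn_fib_rule_match() above is the entire matching step. A worked example, shown in host byte order for readability (the code actually compares __le16 values); the rule is assumed to cover DECnet area 1 with a 6-bit prefix mask:

	rule:   src = 0x0400 (1.0), srcmask = 0xfc00
	packet: saddr = 0x0403 (1.3)

	(saddr ^ r->src) & r->srcmask
	  = (0x0403 ^ 0x0400) & 0xfc00
	  =  0x0003           & 0xfc00
	  =  0		-> masked prefix bits are equal, the source matches

Any difference inside the masked bits yields a non-zero value, and the match op rejects the rule by returning 0.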
- * - * DECnet Routing Forwarding Information Base (Routing Tables) - * - * Author: Steve Whitehouse - * Mostly copied from the IPv4 routing code - * - * - * Changes: - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* RTF_xxx */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct dn_zone -{ - struct dn_zone *dz_next; - struct dn_fib_node **dz_hash; - int dz_nent; - int dz_divisor; - u32 dz_hashmask; -#define DZ_HASHMASK(dz) ((dz)->dz_hashmask) - int dz_order; - __le16 dz_mask; -#define DZ_MASK(dz) ((dz)->dz_mask) -}; - -struct dn_hash -{ - struct dn_zone *dh_zones[17]; - struct dn_zone *dh_zone_list; -}; - -#define dz_key_0(key) ((key).datum = 0) - -#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ - for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define endfor_nexthops(fi) } - -#define DN_MAX_DIVISOR 1024 -#define DN_S_ZOMBIE 1 -#define DN_S_ACCESSED 2 - -#define DN_FIB_SCAN(f, fp) \ -for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) - -#define DN_FIB_SCAN_KEY(f, fp, key) \ -for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) - -#define RT_TABLE_MIN 1 -#define DN_FIB_TABLE_HASHSZ 256 -static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ]; -static DEFINE_RWLOCK(dn_fib_tables_lock); - -static struct kmem_cache *dn_hash_kmem __read_mostly; -static int dn_fib_hash_zombies; - -static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) -{ - u16 h = le16_to_cpu(key.datum)>>(16 - dz->dz_order); - h ^= (h >> 10); - h ^= (h >> 6); - h &= DZ_HASHMASK(dz); - return *(dn_fib_idx_t *)&h; -} - -static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz) -{ - dn_fib_key_t k; - k.datum = dst & DZ_MASK(dz); - return k; -} - -static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz) -{ - return &dz->dz_hash[dn_hash(key, dz).datum]; -} - -static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz) -{ - return dz->dz_hash[dn_hash(key, dz).datum]; -} - -static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b) -{ - return a.datum == b.datum; -} - -static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b) -{ - return a.datum <= b.datum; -} - -static inline void dn_rebuild_zone(struct dn_zone *dz, - struct dn_fib_node **old_ht, - int old_divisor) -{ - struct dn_fib_node *f, **fp, *next; - int i; - - for(i = 0; i < old_divisor; i++) { - for(f = old_ht[i]; f; f = next) { - next = f->fn_next; - for(fp = dn_chain_p(f->fn_key, dz); - *fp && dn_key_leq((*fp)->fn_key, f->fn_key); - fp = &(*fp)->fn_next) - /* NOTHING */; - f->fn_next = *fp; - *fp = f; - } - } -} - -static void dn_rehash_zone(struct dn_zone *dz) -{ - struct dn_fib_node **ht, **old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - old_divisor = dz->dz_divisor; - - switch (old_divisor) { - case 16: - new_divisor = 256; - new_hashmask = 0xFF; - break; - default: - printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! 
%d\n", - old_divisor); - fallthrough; - case 256: - new_divisor = 1024; - new_hashmask = 0x3FF; - break; - } - - ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL); - if (ht == NULL) - return; - - write_lock_bh(&dn_fib_tables_lock); - old_ht = dz->dz_hash; - dz->dz_hash = ht; - dz->dz_hashmask = new_hashmask; - dz->dz_divisor = new_divisor; - dn_rebuild_zone(dz, old_ht, old_divisor); - write_unlock_bh(&dn_fib_tables_lock); - kfree(old_ht); -} - -static void dn_free_node(struct dn_fib_node *f) -{ - dn_fib_release_info(DN_FIB_INFO(f)); - kmem_cache_free(dn_hash_kmem, f); -} - - -static struct dn_zone *dn_new_zone(struct dn_hash *table, int z) -{ - int i; - struct dn_zone *dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL); - if (!dz) - return NULL; - - if (z) { - dz->dz_divisor = 16; - dz->dz_hashmask = 0x0F; - } else { - dz->dz_divisor = 1; - dz->dz_hashmask = 0; - } - - dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL); - if (!dz->dz_hash) { - kfree(dz); - return NULL; - } - - dz->dz_order = z; - dz->dz_mask = dnet_make_mask(z); - - for(i = z + 1; i <= 16; i++) - if (table->dh_zones[i]) - break; - - write_lock_bh(&dn_fib_tables_lock); - if (i>16) { - dz->dz_next = table->dh_zone_list; - table->dh_zone_list = dz; - } else { - dz->dz_next = table->dh_zones[i]->dz_next; - table->dh_zones[i]->dz_next = dz; - } - table->dh_zones[z] = dz; - write_unlock_bh(&dn_fib_tables_lock); - return dz; -} - - -static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct nlattr *attrs[], struct dn_fib_info *fi) -{ - struct rtnexthop *nhp; - int nhlen; - - if (attrs[RTA_PRIORITY] && - nla_get_u32(attrs[RTA_PRIORITY]) != fi->fib_priority) - return 1; - - if (attrs[RTA_OIF] || attrs[RTA_GATEWAY]) { - if ((!attrs[RTA_OIF] || nla_get_u32(attrs[RTA_OIF]) == fi->fib_nh->nh_oif) && - (!attrs[RTA_GATEWAY] || nla_get_le16(attrs[RTA_GATEWAY]) != fi->fib_nh->nh_gw)) - return 0; - return 1; - } - - if (!attrs[RTA_MULTIPATH]) - return 0; - - nhp = nla_data(attrs[RTA_MULTIPATH]); - nhlen = nla_len(attrs[RTA_MULTIPATH]); - - for_nexthops(fi) { - int attrlen = nhlen - sizeof(struct rtnexthop); - __le16 gw; - - if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) - return -EINVAL; - if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) - return 1; - if (attrlen) { - struct nlattr *gw_attr; - - gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); - gw = gw_attr ? 
nla_get_le16(gw_attr) : 0; - - if (gw && gw != nh->nh_gw) - return 1; - } - nhp = RTNH_NEXT(nhp); - } endfor_nexthops(fi); - - return 0; -} - -static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi) -{ - size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) - + nla_total_size(4) /* RTA_TABLE */ - + nla_total_size(2) /* RTA_DST */ - + nla_total_size(4) /* RTA_PRIORITY */ - + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ - - /* space for nested metrics */ - payload += nla_total_size((RTAX_MAX * nla_total_size(4))); - - if (fi->fib_nhs) { - /* Also handles the special case fib_nhs == 1 */ - - /* each nexthop is packed in an attribute */ - size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); - - /* may contain a gateway attribute */ - nhsize += nla_total_size(4); - - /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); - } - - return payload; -} - -static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, void *dst, int dst_len, - struct dn_fib_info *fi, unsigned int flags) -{ - struct rtmsg *rtm; - struct nlmsghdr *nlh; - - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); - if (!nlh) - return -EMSGSIZE; - - rtm = nlmsg_data(nlh); - rtm->rtm_family = AF_DECnet; - rtm->rtm_dst_len = dst_len; - rtm->rtm_src_len = 0; - rtm->rtm_tos = 0; - rtm->rtm_table = tb_id; - rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; - rtm->rtm_type = type; - rtm->rtm_protocol = fi->fib_protocol; - - if (nla_put_u32(skb, RTA_TABLE, tb_id) < 0) - goto errout; - - if (rtm->rtm_dst_len && - nla_put(skb, RTA_DST, 2, dst) < 0) - goto errout; - - if (fi->fib_priority && - nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority) < 0) - goto errout; - - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) - goto errout; - - if (fi->fib_nhs == 1) { - if (fi->fib_nh->nh_gw && - nla_put_le16(skb, RTA_GATEWAY, fi->fib_nh->nh_gw) < 0) - goto errout; - - if (fi->fib_nh->nh_oif && - nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif) < 0) - goto errout; - } - - if (fi->fib_nhs > 1) { - struct rtnexthop *nhp; - struct nlattr *mp_head; - - mp_head = nla_nest_start_noflag(skb, RTA_MULTIPATH); - if (!mp_head) - goto errout; - - for_nexthops(fi) { - if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) - goto errout; - - nhp->rtnh_flags = nh->nh_flags & 0xFF; - nhp->rtnh_hops = nh->nh_weight - 1; - nhp->rtnh_ifindex = nh->nh_oif; - - if (nh->nh_gw && - nla_put_le16(skb, RTA_GATEWAY, nh->nh_gw) < 0) - goto errout; - - nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp; - } endfor_nexthops(fi); - - nla_nest_end(skb, mp_head); - } - - nlmsg_end(skb, nlh); - return 0; - -errout: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - - -static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, - struct nlmsghdr *nlh, struct netlink_skb_parms *req) -{ - struct sk_buff *skb; - u32 portid = req ? 
req->portid : 0; - int err = -ENOBUFS; - - skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL); - if (skb == NULL) - goto errout; - - err = dn_fib_dump_info(skb, portid, nlh->nlmsg_seq, event, tb_id, - f->fn_type, f->fn_scope, &f->fn_key, z, - DN_FIB_INFO(f), 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in dn_fib_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - rtnl_notify(skb, &init_net, portid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); - return; -errout: - if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); -} - -static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb, - struct netlink_callback *cb, - struct dn_fib_table *tb, - struct dn_zone *dz, - struct dn_fib_node *f) -{ - int i, s_i; - - s_i = cb->args[4]; - for(i = 0; f; i++, f = f->fn_next) { - if (i < s_i) - continue; - if (f->fn_state & DN_S_ZOMBIE) - continue; - if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->n, - (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type, - f->fn_scope, &f->fn_key, dz->dz_order, - f->fn_info, NLM_F_MULTI) < 0) { - cb->args[4] = i; - return -1; - } - } - cb->args[4] = i; - return skb->len; -} - -static __inline__ int dn_hash_dump_zone(struct sk_buff *skb, - struct netlink_callback *cb, - struct dn_fib_table *tb, - struct dn_zone *dz) -{ - int h, s_h; - - s_h = cb->args[3]; - for(h = 0; h < dz->dz_divisor; h++) { - if (h < s_h) - continue; - if (h > s_h) - memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); - if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL) - continue; - if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) { - cb->args[3] = h; - return -1; - } - } - cb->args[3] = h; - return skb->len; -} - -static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) -{ - int m, s_m; - struct dn_zone *dz; - struct dn_hash *table = (struct dn_hash *)tb->data; - - s_m = cb->args[2]; - read_lock(&dn_fib_tables_lock); - for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) { - if (m < s_m) - continue; - if (m > s_m) - memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); - - if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) { - cb->args[2] = m; - read_unlock(&dn_fib_tables_lock); - return -1; - } - } - read_unlock(&dn_fib_tables_lock); - cb->args[2] = m; - - return skb->len; -} - -int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - unsigned int h, s_h; - unsigned int e = 0, s_e; - struct dn_fib_table *tb; - int dumped = 0; - - if (!net_eq(net, &init_net)) - return 0; - - if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && - ((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED) - return dn_cache_dump(skb, cb); - - s_h = cb->args[0]; - s_e = cb->args[1]; - - for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) { - e = 0; - hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) { - if (e < s_e) - goto next; - if (dumped) - memset(&cb->args[2], 0, sizeof(cb->args) - - 2 * sizeof(cb->args[0])); - if (tb->dump(tb, skb, cb) < 0) - goto out; - dumped = 1; -next: - e++; - } - } -out: - cb->args[1] = e; - cb->args[0] = h; - - return skb->len; -} - -static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[], - struct nlmsghdr *n, struct netlink_skb_parms *req) -{ - struct dn_hash *table = (struct dn_hash *)tb->data; - struct dn_fib_node *new_f, *f, **fp, **del_fp; - struct dn_zone *dz; - struct dn_fib_info *fi; - int z 
= r->rtm_dst_len; - int type = r->rtm_type; - dn_fib_key_t key; - int err; - - if (z > 16) - return -EINVAL; - - dz = table->dh_zones[z]; - if (!dz && !(dz = dn_new_zone(table, z))) - return -ENOBUFS; - - dz_key_0(key); - if (attrs[RTA_DST]) { - __le16 dst = nla_get_le16(attrs[RTA_DST]); - if (dst & ~DZ_MASK(dz)) - return -EINVAL; - key = dz_key(dst, dz); - } - - if ((fi = dn_fib_create_info(r, attrs, n, &err)) == NULL) - return err; - - if (dz->dz_nent > (dz->dz_divisor << 2) && - dz->dz_divisor > DN_MAX_DIVISOR && - (z==16 || (1<<z) > dz->dz_divisor)) - dn_rehash_zone(dz); - - fp = dn_chain_p(key, dz); - - DN_FIB_SCAN(f, fp) { - if (dn_key_leq(key, f->fn_key)) - break; - } - - del_fp = NULL; - - if (f && (f->fn_state & DN_S_ZOMBIE) && - dn_key_eq(f->fn_key, key)) { - del_fp = fp; - fp = &f->fn_next; - f = *fp; - goto create; - } - - DN_FIB_SCAN_KEY(f, fp, key) { - if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority) - break; - } - - if (f && dn_key_eq(f->fn_key, key) && - fi->fib_priority == DN_FIB_INFO(f)->fib_priority) { - struct dn_fib_node **ins_fp; - - err = -EEXIST; - if (n->nlmsg_flags & NLM_F_EXCL) - goto out; - - if (n->nlmsg_flags & NLM_F_REPLACE) { - del_fp = fp; - fp = &f->fn_next; - f = *fp; - goto replace; - } - - ins_fp = fp; - err = -EEXIST; - - DN_FIB_SCAN_KEY(f, fp, key) { - if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority) - break; - if (f->fn_type == type && - f->fn_scope == r->rtm_scope && - DN_FIB_INFO(f) == fi) - goto out; - } - - if (!(n->nlmsg_flags & NLM_F_APPEND)) { - fp = ins_fp; - f = *fp; - } - } - -create: - err = -ENOENT; - if (!(n->nlmsg_flags & NLM_F_CREATE)) - goto out; - -replace: - err = -ENOBUFS; - new_f = kmem_cache_zalloc(dn_hash_kmem, GFP_KERNEL); - if (new_f == NULL) - goto out; - - new_f->fn_key = key; - new_f->fn_type = type; - new_f->fn_scope = r->rtm_scope; - DN_FIB_INFO(new_f) = fi; - - new_f->fn_next = f; - write_lock_bh(&dn_fib_tables_lock); - *fp = new_f; - write_unlock_bh(&dn_fib_tables_lock); - dz->dz_nent++; - - if (del_fp) { - f = *del_fp; - write_lock_bh(&dn_fib_tables_lock); - *del_fp = f->fn_next; - write_unlock_bh(&dn_fib_tables_lock); - - if (!(f->fn_state & DN_S_ZOMBIE)) - dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); - if (f->fn_state & DN_S_ACCESSED) - dn_rt_cache_flush(-1); - dn_free_node(f); - dz->dz_nent--; - } else { - dn_rt_cache_flush(-1); - } - - dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req); - - return 0; -out: - dn_fib_release_info(fi); - return err; -} - - -static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[], - struct nlmsghdr *n, struct netlink_skb_parms *req) -{ - struct dn_hash *table = (struct dn_hash*)tb->data; - struct dn_fib_node **fp, **del_fp, *f; - int z = r->rtm_dst_len; - struct dn_zone *dz; - dn_fib_key_t key; - int matched; - - - if (z > 16) - return -EINVAL; - - if ((dz = table->dh_zones[z]) == NULL) - return -ESRCH; - - dz_key_0(key); - if (attrs[RTA_DST]) { - __le16 dst = nla_get_le16(attrs[RTA_DST]); - if (dst & ~DZ_MASK(dz)) - return -EINVAL; - key = dz_key(dst, dz); - } - - fp = dn_chain_p(key, dz); - - DN_FIB_SCAN(f, fp) { - if (dn_key_eq(f->fn_key, key)) - break; - if (dn_key_leq(key, f->fn_key)) - return -ESRCH; - } - - matched = 0; - del_fp = NULL; - DN_FIB_SCAN_KEY(f, fp, key) { - struct dn_fib_info *fi = DN_FIB_INFO(f); - - if (f->fn_state & DN_S_ZOMBIE) - return -ESRCH; - - matched++; - - if (del_fp == NULL && - (!r->rtm_type || f->fn_type == r->rtm_type) && - (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && -
(!r->rtm_protocol || - fi->fib_protocol == r->rtm_protocol) && - dn_fib_nh_match(r, n, attrs, fi) == 0) - del_fp = fp; - } - - if (del_fp) { - f = *del_fp; - dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); - - if (matched != 1) { - write_lock_bh(&dn_fib_tables_lock); - *del_fp = f->fn_next; - write_unlock_bh(&dn_fib_tables_lock); - - if (f->fn_state & DN_S_ACCESSED) - dn_rt_cache_flush(-1); - dn_free_node(f); - dz->dz_nent--; - } else { - f->fn_state |= DN_S_ZOMBIE; - if (f->fn_state & DN_S_ACCESSED) { - f->fn_state &= ~DN_S_ACCESSED; - dn_rt_cache_flush(-1); - } - if (++dn_fib_hash_zombies > 128) - dn_fib_flush(); - } - - return 0; - } - - return -ESRCH; -} - -static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table) -{ - int found = 0; - struct dn_fib_node *f; - - while((f = *fp) != NULL) { - struct dn_fib_info *fi = DN_FIB_INFO(f); - - if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) { - write_lock_bh(&dn_fib_tables_lock); - *fp = f->fn_next; - write_unlock_bh(&dn_fib_tables_lock); - - dn_free_node(f); - found++; - continue; - } - fp = &f->fn_next; - } - - return found; -} - -static int dn_fib_table_flush(struct dn_fib_table *tb) -{ - struct dn_hash *table = (struct dn_hash *)tb->data; - struct dn_zone *dz; - int found = 0; - - dn_fib_hash_zombies = 0; - for(dz = table->dh_zone_list; dz; dz = dz->dz_next) { - int i; - int tmp = 0; - for(i = dz->dz_divisor-1; i >= 0; i--) - tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table); - dz->dz_nent -= tmp; - found += tmp; - } - - return found; -} - -static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res) -{ - int err; - struct dn_zone *dz; - struct dn_hash *t = (struct dn_hash *)tb->data; - - read_lock(&dn_fib_tables_lock); - for(dz = t->dh_zone_list; dz; dz = dz->dz_next) { - struct dn_fib_node *f; - dn_fib_key_t k = dz_key(flp->daddr, dz); - - for(f = dz_chain(k, dz); f; f = f->fn_next) { - if (!dn_key_eq(k, f->fn_key)) { - if (dn_key_leq(k, f->fn_key)) - break; - else - continue; - } - - f->fn_state |= DN_S_ACCESSED; - - if (f->fn_state&DN_S_ZOMBIE) - continue; - - if (f->fn_scope < flp->flowidn_scope) - continue; - - err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res); - - if (err == 0) { - res->type = f->fn_type; - res->scope = f->fn_scope; - res->prefixlen = dz->dz_order; - goto out; - } - if (err < 0) - goto out; - } - } - err = 1; -out: - read_unlock(&dn_fib_tables_lock); - return err; -} - - -struct dn_fib_table *dn_fib_get_table(u32 n, int create) -{ - struct dn_fib_table *t; - unsigned int h; - - if (n < RT_TABLE_MIN) - return NULL; - - if (n > RT_TABLE_MAX) - return NULL; - - h = n & (DN_FIB_TABLE_HASHSZ - 1); - rcu_read_lock(); - hlist_for_each_entry_rcu(t, &dn_fib_table_hash[h], hlist) { - if (t->n == n) { - rcu_read_unlock(); - return t; - } - } - rcu_read_unlock(); - - if (!create) - return NULL; - - if (in_interrupt()) { - net_dbg_ratelimited("DECnet: BUG! 
Attempt to create routing table from interrupt\n"); - return NULL; - } - - t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash), - GFP_KERNEL); - if (t == NULL) - return NULL; - - t->n = n; - t->insert = dn_fib_table_insert; - t->delete = dn_fib_table_delete; - t->lookup = dn_fib_table_lookup; - t->flush = dn_fib_table_flush; - t->dump = dn_fib_table_dump; - hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]); - - return t; -} - -struct dn_fib_table *dn_fib_empty_table(void) -{ - u32 id; - - for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++) - if (dn_fib_get_table(id, 0) == NULL) - return dn_fib_get_table(id, 1); - return NULL; -} - -void dn_fib_flush(void) -{ - int flushed = 0; - struct dn_fib_table *tb; - unsigned int h; - - for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) - flushed += tb->flush(tb); - } - - if (flushed) - dn_rt_cache_flush(-1); -} - -void __init dn_fib_table_init(void) -{ - dn_hash_kmem = kmem_cache_create("dn_fib_info_cache", - sizeof(struct dn_fib_info), - 0, SLAB_HWCACHE_ALIGN, - NULL); -} - -void __exit dn_fib_table_cleanup(void) -{ - struct dn_fib_table *t; - struct hlist_node *next; - unsigned int h; - - write_lock(&dn_fib_tables_lock); - for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry_safe(t, next, &dn_fib_table_hash[h], - hlist) { - hlist_del(&t->hlist); - kfree(t); - } - } - write_unlock(&dn_fib_tables_lock); -} diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c deleted file mode 100644 index aa4155875ca8..000000000000 --- a/net/decnet/dn_timer.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet Socket Timer Functions - * - * Author: Steve Whitehouse - * - * - * Changes: - * Steve Whitehouse : Made keepalive timer part of the same - * timer idea. - * Steve Whitehouse : Added checks for sk->sock_readers - * David S. Miller : New socket locking - * Steve Whitehouse : Timer grabs socket ref. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Slow timer is for everything else (n * 500ms) - */ - -#define SLOW_INTERVAL (HZ/2) - -static void dn_slow_timer(struct timer_list *t); - -void dn_start_slow_timer(struct sock *sk) -{ - timer_setup(&sk->sk_timer, dn_slow_timer, 0); - sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); -} - -void dn_stop_slow_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} - -static void dn_slow_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - struct dn_scp *scp = DN_SK(sk); - - bh_lock_sock(sk); - - if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 10); - goto out; - } - - /* - * The persist timer is the standard slow timer used for retransmits - * in both connection establishment and disconnection as well as - * in the RUN state. The different states are catered for by changing - * the function pointer in the socket. Setting the timer to a value - * of zero turns it off. We allow the persist_fxn to turn the - * timer off in a permanent way by returning non-zero, so that - * timer based routines may remove sockets. This is why we have a - * sock_hold()/sock_put() around the timer to prevent the socket - * going away in the middle. 
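A hypothetical persist_fxn making that contract concrete; DN_DI and dn_destroy_sock() are plausible placeholders here, not code from this series:

	static int example_persist(struct sock *sk)
	{
		struct dn_scp *scp = DN_SK(sk);

		if (scp->state == DN_DI) {	/* e.g. disconnect timed out */
			dn_destroy_sock(sk);	/* assumed teardown helper */
			return 1;		/* non-zero: never rearm the timer */
		}

		scp->persist = 5 * HZ;		/* count down again from 5s */
		return 0;			/* zero: keep the slow timer running */
	}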
- */ - if (scp->persist && scp->persist_fxn) { - if (scp->persist <= SLOW_INTERVAL) { - scp->persist = 0; - - if (scp->persist_fxn(sk)) - goto out; - } else { - scp->persist -= SLOW_INTERVAL; - } - } - - /* - * Check for keepalive timeout. After the other timer 'cos if - * the previous timer caused a retransmit, we don't need to - * do this. scp->stamp is the last time that we sent a packet. - * The keepalive function sends a link service packet to the - * other end. If it remains unacknowledged, the standard - * socket timers will eventually shut the socket down. Each - * time we do this, scp->stamp will be updated, thus - * we won't try and send another until scp->keepalive has passed - * since the last successful transmission. - */ - if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) { - if (time_after_eq(jiffies, scp->stamp + scp->keepalive)) - scp->keepalive_fxn(sk); - } - - sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); -out: - bh_unlock_sock(sk); - sock_put(sk); -} diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig deleted file mode 100644 index 14ec4ef95fab..000000000000 --- a/net/decnet/netfilter/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# DECnet netfilter configuration -# - -menu "DECnet: Netfilter Configuration" - depends on DECNET && NETFILTER - depends on NETFILTER_ADVANCED - -config DECNET_NF_GRABULATOR - tristate "Routing message grabulator (for userland routing daemon)" - help - Enable this module if you want to use the userland DECnet routing - daemon. You will also need to enable routing support for DECnet - unless you just want to monitor routing messages from other nodes. - -endmenu diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile deleted file mode 100644 index 429c84289d0f..000000000000 --- a/net/decnet/netfilter/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile for DECnet netfilter modules -# - -obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c deleted file mode 100644 index 26a9193df783..000000000000 --- a/net/decnet/netfilter/dn_rtmsg.c +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. 
- * - * DECnet Routing Message Grabulator - * - * (C) 2000 ChyGwyn Limited - https://www.chygwyn.com/ - * - * Author: Steven Whitehouse - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static struct sock *dnrmg = NULL; - - -static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) -{ - struct sk_buff *skb = NULL; - size_t size; - sk_buff_data_t old_tail; - struct nlmsghdr *nlh; - unsigned char *ptr; - struct nf_dn_rtmsg *rtm; - - size = NLMSG_ALIGN(rt_skb->len) + - NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)); - skb = nlmsg_new(size, GFP_ATOMIC); - if (!skb) { - *errp = -ENOMEM; - return NULL; - } - old_tail = skb->tail; - nlh = nlmsg_put(skb, 0, 0, 0, size, 0); - if (!nlh) { - kfree_skb(skb); - *errp = -ENOMEM; - return NULL; - } - rtm = (struct nf_dn_rtmsg *)nlmsg_data(nlh); - rtm->nfdn_ifindex = rt_skb->dev->ifindex; - ptr = NFDN_RTMSG(rtm); - skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len); - nlh->nlmsg_len = skb->tail - old_tail; - return skb; -} - -static void dnrmg_send_peer(struct sk_buff *skb) -{ - struct sk_buff *skb2; - int status = 0; - int group = 0; - unsigned char flags = *skb->data; - - switch (flags & DN_RT_CNTL_MSK) { - case DN_RT_PKT_L1RT: - group = DNRNG_NLGRP_L1; - break; - case DN_RT_PKT_L2RT: - group = DNRNG_NLGRP_L2; - break; - default: - return; - } - - skb2 = dnrmg_build_message(skb, &status); - if (skb2 == NULL) - return; - NETLINK_CB(skb2).dst_group = group; - netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); -} - - -static unsigned int dnrmg_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - dnrmg_send_peer(skb); - return NF_ACCEPT; -} - - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err), NULL); return; } while (0) - -static inline void dnrmg_receive_user_skb(struct sk_buff *skb) -{ - struct nlmsghdr *nlh = nlmsg_hdr(skb); - - if (skb->len < sizeof(*nlh) || - nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return; - - if (!netlink_capable(skb, CAP_NET_ADMIN)) - RCV_SKB_FAIL(-EPERM); - - /* Eventually we might send routing messages too */ - - RCV_SKB_FAIL(-EINVAL); -} - -static const struct nf_hook_ops dnrmg_ops = { - .hook = dnrmg_hook, - .pf = NFPROTO_DECNET, - .hooknum = NF_DN_ROUTE, - .priority = NF_DN_PRI_DNRTMSG, -}; - -static int __init dn_rtmsg_init(void) -{ - int rv = 0; - struct netlink_kernel_cfg cfg = { - .groups = DNRNG_NLGRP_MAX, - .input = dnrmg_receive_user_skb, - }; - - dnrmg = netlink_kernel_create(&init_net, NETLINK_DNRTMSG, &cfg); - if (dnrmg == NULL) { - printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); - return -ENOMEM; - } - - rv = nf_register_net_hook(&init_net, &dnrmg_ops); - if (rv) { - netlink_kernel_release(dnrmg); - } - - return rv; -} - -static void __exit dn_rtmsg_fini(void) -{ - nf_unregister_net_hook(&init_net, &dnrmg_ops); - netlink_kernel_release(dnrmg); -} - - -MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); -MODULE_AUTHOR("Steven Whitehouse "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); - -module_init(dn_rtmsg_init); -module_exit(dn_rtmsg_fini); diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c deleted file mode 100644 index 67b5ab2657b7..000000000000 --- a/net/decnet/sysctl_net_decnet.c +++ /dev/null @@ -1,362 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DECnet An implementation of the DECnet protocol suite for the LINUX - * operating system. 
DECnet is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * DECnet sysctl support functions - * - * Author: Steve Whitehouse - * - * - * Changes: - * Steve Whitehouse - C99 changes and default device handling - * Steve Whitehouse - Memory buffer settings, like the tcp ones - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - - -int decnet_debug_level; -int decnet_time_wait = 30; -int decnet_dn_count = 1; -int decnet_di_count = 3; -int decnet_dr_count = 3; -int decnet_log_martians = 1; -int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; - -/* Reasonable defaults, I hope, based on tcp's defaults */ -long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; -int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; -int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; - -#ifdef CONFIG_SYSCTL -extern int decnet_dst_gc_interval; -static int min_decnet_time_wait[] = { 5 }; -static int max_decnet_time_wait[] = { 600 }; -static int min_state_count[] = { 1 }; -static int max_state_count[] = { NSP_MAXRXTSHIFT }; -static int min_decnet_dst_gc_interval[] = { 1 }; -static int max_decnet_dst_gc_interval[] = { 60 }; -static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW }; -static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW }; -static char node_name[7] = "???"; - -static struct ctl_table_header *dn_table_header = NULL; - -/* - * ctype.h :-) - */ -#define ISNUM(x) (((x) >= '0') && ((x) <= '9')) -#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z')) -#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z')) -#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x)) -#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x)) - -static void strip_it(char *str) -{ - for(;;) { - switch (*str) { - case ' ': - case '\n': - case '\r': - case ':': - *str = 0; - fallthrough; - case 0: - return; - } - str++; - } -} - -/* - * Simple routine to parse an ascii DECnet address - * into a network order address. - */ -static int parse_addr(__le16 *addr, char *str) -{ - __u16 area, node; - - while(*str && !ISNUM(*str)) str++; - - if (*str == 0) - return -1; - - area = (*str++ - '0'); - if (ISNUM(*str)) { - area *= 10; - area += (*str++ - '0'); - } - - if (*str++ != '.') - return -1; - - if (!ISNUM(*str)) - return -1; - - node = *str++ - '0'; - if (ISNUM(*str)) { - node *= 10; - node += (*str++ - '0'); - } - if (ISNUM(*str)) { - node *= 10; - node += (*str++ - '0'); - } - if (ISNUM(*str)) { - node *= 10; - node += (*str++ - '0'); - } - - if ((node > 1023) || (area > 63)) - return -1; - - if (INVALID_END_CHAR(*str)) - return -1; - - *addr = cpu_to_le16((area << 10) | node); - - return 0; -} - -static int dn_node_address_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - char addr[DN_ASCBUF_LEN]; - size_t len; - __le16 dnaddr; - - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - len = (*lenp < DN_ASCBUF_LEN) ? 
*lenp : (DN_ASCBUF_LEN-1); - memcpy(addr, buffer, len); - addr[len] = 0; - strip_it(addr); - - if (parse_addr(&dnaddr, addr)) - return -EINVAL; - - dn_dev_devices_off(); - - decnet_address = dnaddr; - - dn_dev_devices_on(); - - *ppos += len; - - return 0; - } - - dn_addr2asc(le16_to_cpu(decnet_address), addr); - len = strlen(addr); - addr[len++] = '\n'; - - if (len > *lenp) - len = *lenp; - memcpy(buffer, addr, len); - *lenp = len; - *ppos += len; - - return 0; -} - -static int dn_def_dev_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - size_t len; - struct net_device *dev; - char devname[17]; - - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - if (*lenp > 16) - return -E2BIG; - - memcpy(devname, buffer, *lenp); - devname[*lenp] = 0; - strip_it(devname); - - dev = dev_get_by_name(&init_net, devname); - if (dev == NULL) - return -ENODEV; - - if (dev->dn_ptr == NULL) { - dev_put(dev); - return -ENODEV; - } - - if (dn_dev_set_default(dev, 1)) { - dev_put(dev); - return -ENODEV; - } - *ppos += *lenp; - - return 0; - } - - dev = dn_dev_get_default(); - if (dev == NULL) { - *lenp = 0; - return 0; - } - - strcpy(devname, dev->name); - dev_put(dev); - len = strlen(devname); - devname[len++] = '\n'; - - if (len > *lenp) len = *lenp; - - memcpy(buffer, devname, len); - *lenp = len; - *ppos += len; - - return 0; -} - -static struct ctl_table dn_table[] = { - { - .procname = "node_address", - .maxlen = 7, - .mode = 0644, - .proc_handler = dn_node_address_handler, - }, - { - .procname = "node_name", - .data = node_name, - .maxlen = 7, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "default_device", - .maxlen = 16, - .mode = 0644, - .proc_handler = dn_def_dev_handler, - }, - { - .procname = "time_wait", - .data = &decnet_time_wait, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_decnet_time_wait, - .extra2 = &max_decnet_time_wait - }, - { - .procname = "dn_count", - .data = &decnet_dn_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_state_count, - .extra2 = &max_state_count - }, - { - .procname = "di_count", - .data = &decnet_di_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_state_count, - .extra2 = &max_state_count - }, - { - .procname = "dr_count", - .data = &decnet_dr_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_state_count, - .extra2 = &max_state_count - }, - { - .procname = "dst_gc_interval", - .data = &decnet_dst_gc_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_decnet_dst_gc_interval, - .extra2 = &max_decnet_dst_gc_interval - }, - { - .procname = "no_fc_max_cwnd", - .data = &decnet_no_fc_max_cwnd, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_decnet_no_fc_max_cwnd, - .extra2 = &max_decnet_no_fc_max_cwnd - }, - { - .procname = "decnet_mem", - .data = &sysctl_decnet_mem, - .maxlen = sizeof(sysctl_decnet_mem), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax - }, - { - .procname = "decnet_rmem", - .data = &sysctl_decnet_rmem, - .maxlen = sizeof(sysctl_decnet_rmem), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "decnet_wmem", - .data = &sysctl_decnet_wmem, - .maxlen = sizeof(sysctl_decnet_wmem), - .mode = 0644, - .proc_handler = 
proc_dointvec, - }, - { - .procname = "debug", - .data = &decnet_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -void dn_register_sysctl(void) -{ - dn_table_header = register_net_sysctl(&init_net, "net/decnet", dn_table); -} - -void dn_unregister_sysctl(void) -{ - unregister_net_sysctl_table(dn_table_header); -} - -#else /* CONFIG_SYSCTL */ -void dn_unregister_sysctl(void) -{ -} -void dn_register_sysctl(void) -{ -} - -#endif diff --git a/net/netfilter/core.c b/net/netfilter/core.c index dcf752b55a52..5a6705a0e4ec 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -300,12 +300,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) return NULL; return net->nf.hooks_ipv6 + hooknum; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum)) - return NULL; - return net->nf.hooks_decnet + hooknum; -#endif default: WARN_ON_ONCE(1); return NULL; @@ -750,10 +744,6 @@ static int __net_init netfilter_net_init(struct net *net) #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); #endif -#if IS_ENABLED(CONFIG_DECNET) - __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet)); -#endif - #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 71e29adac48b..8120aadf6a0f 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -215,13 +215,6 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - if (hook >= ARRAY_SIZE(net->nf.hooks_decnet)) - return ERR_PTR(-EINVAL); - hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); - break; -#endif #if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) case NFPROTO_NETDEV: if (hook >= NF_NETDEV_NUMHOOKS) -- cgit v1.2.3 From c6ea70604249bc357ce09e9f8e16c29df0fb2fa2 Mon Sep 17 00:00:00 2001 From: "dougmill@linux.vnet.ibm.com" Date: Tue, 16 Aug 2022 15:07:13 +0100 Subject: block: sed-opal: Add ioctl to return device status Provide a mechanism to retrieve basic status information about the device, including the "supported" flag indicating whether SED-OPAL is supported. The information returned is from the various feature descriptors received during the discovery0 step, and so this ioctl does nothing more than perform the discovery0 step and then save the information received. See "struct opal_status" and OPAL_FL_* bits for the status information currently returned. This is necessary to be able to check whether a device is OPAL enabled, set up, locked or unlocked from userspace programs like systemd-cryptsetup and libcryptsetup. Right now we just have to assume the user 'knows' or blindly attempt setup/lock/unlock operations. 
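For illustration, userspace can query the new status along these lines (a minimal
sketch against the uapi added below; the device path is an example, and the ioctl
is still subject to the usual sed_ioctl() privilege check):

    /* Sketch: read the SED-OPAL status flags for a block device.
     * Only uses the uapi introduced by this patch; /dev/nvme0n1 is
     * an illustrative path.
     */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/sed-opal.h>

    int main(void)
    {
            struct opal_status sts = { 0 };
            int fd = open("/dev/nvme0n1", O_RDONLY);

            if (fd < 0 || ioctl(fd, IOC_OPAL_GET_STATUS, &sts) < 0) {
                    perror("IOC_OPAL_GET_STATUS");
                    return 1;
            }
            printf("supported=%d locking_enabled=%d locked=%d\n",
                   !!(sts.flags & OPAL_FL_SUPPORTED),
                   !!(sts.flags & OPAL_FL_LOCKING_ENABLED),
                   !!(sts.flags & OPAL_FL_LOCKED));
            close(fd);
            return 0;
    }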
Signed-off-by: Douglas Miller Tested-by: Luca Boccassi Reviewed-by: Scott Bauer Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816140713.84893-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 5 +++ block/sed-opal.c | 89 +++++++++++++++++++++++++++++++++++++------ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 13 +++++++ 4 files changed, 96 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/opal_proto.h b/block/opal_proto.h index b486b3ec7dc4..7152aa1f1a49 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -39,7 +39,12 @@ enum opal_response_token { #define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 +/* FC_LOCKING features */ +#define LOCKING_SUPPORTED_MASK 0x01 +#define LOCKING_ENABLED_MASK 0x02 +#define LOCKED_MASK 0x04 #define MBR_ENABLED_MASK 0x10 +#define MBR_DONE_MASK 0x20 #define TINY_ATOM_DATA_MASK 0x3F #define TINY_ATOM_SIGNED 0x40 diff --git a/block/sed-opal.c b/block/sed-opal.c index 9700197000f2..2c5327a0543a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -74,8 +74,7 @@ struct parsed_resp { }; struct opal_dev { - bool supported; - bool mbr_enabled; + u32 flags; void *data; sec_send_recv *send_recv; @@ -280,6 +279,30 @@ static bool check_tper(const void *data) return true; } +static bool check_lcksuppt(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_SUPPORTED_MASK); +} + +static bool check_lckenabled(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_ENABLED_MASK); +} + +static bool check_locked(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKED_MASK); +} + static bool check_mbrenabled(const void *data) { const struct d0_locking_features *lfeat = data; @@ -288,6 +311,14 @@ static bool check_mbrenabled(const void *data) return !!(sup_feat & MBR_ENABLED_MASK); } +static bool check_mbrdone(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & MBR_DONE_MASK); +} + static bool check_sum(const void *data) { const struct d0_single_user_mode *sum = data; @@ -435,7 +466,7 @@ static int opal_discovery0_end(struct opal_dev *dev) u32 hlen = be32_to_cpu(hdr->length); print_buffer(dev->resp, hlen); - dev->mbr_enabled = false; + dev->flags &= OPAL_FL_SUPPORTED; if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", @@ -461,7 +492,16 @@ static int opal_discovery0_end(struct opal_dev *dev) check_geometry(dev, body); break; case FC_LOCKING: - dev->mbr_enabled = check_mbrenabled(body->features); + if (check_lcksuppt(body->features)) + dev->flags |= OPAL_FL_LOCKING_SUPPORTED; + if (check_lckenabled(body->features)) + dev->flags |= OPAL_FL_LOCKING_ENABLED; + if (check_locked(body->features)) + dev->flags |= OPAL_FL_LOCKED; + if (check_mbrenabled(body->features)) + dev->flags |= OPAL_FL_MBR_ENABLED; + if (check_mbrdone(body->features)) + dev->flags |= OPAL_FL_MBR_DONE; break; case FC_ENTERPRISE: case FC_DATASTORE: @@ -2109,7 +2149,8 @@ static int check_opal_support(struct opal_dev *dev) mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = opal_discovery0_step(dev); - dev->supported = !ret; + if (!ret) + dev->flags |= OPAL_FL_SUPPORTED; 
mutex_unlock(&dev->dev_lock); return ret; @@ -2148,6 +2189,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) INIT_LIST_HEAD(&dev->unlk_lst); mutex_init(&dev->dev_lock); + dev->flags = 0; dev->data = data; dev->send_recv = send_recv; if (check_opal_support(dev) != 0) { @@ -2528,7 +2570,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) if (!dev) return false; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return false; mutex_lock(&dev->dev_lock); @@ -2546,7 +2588,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) was_failure = true; } - if (dev->mbr_enabled) { + if (dev->flags & OPAL_FL_MBR_ENABLED) { ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); if (ret) pr_debug("Failed to set MBR Done in S3 resume\n"); @@ -2620,6 +2662,23 @@ static int opal_generic_read_write_table(struct opal_dev *dev, return ret; } +static int opal_get_status(struct opal_dev *dev, void __user *data) +{ + struct opal_status sts = {0}; + + /* + * check_opal_support() error is not fatal, + * !dev->supported is a valid condition + */ + if (!check_opal_support(dev)) + sts.flags = dev->flags; + if (copy_to_user(data, &sts, sizeof(sts))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + return 0; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2629,12 +2688,14 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return -EACCES; if (!dev) return -ENOTSUPP; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return -ENOTSUPP; - p = memdup_user(arg, _IOC_SIZE(cmd)); - if (IS_ERR(p)) - return PTR_ERR(p); + if (cmd & IOC_IN) { + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + } switch (cmd) { case IOC_OPAL_SAVE: @@ -2685,11 +2746,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GENERIC_TABLE_RW: ret = opal_generic_read_write_table(dev, p); break; + case IOC_OPAL_GET_STATUS: + ret = opal_get_status(dev, arg); + break; default: break; } - kfree(p); + if (cmd & IOC_IN) + kfree(p); return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1ac0d712a9c3..6f837bb6c715 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -43,6 +43,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: case IOC_OPAL_GENERIC_TABLE_RW: + case IOC_OPAL_GET_STATUS: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 6f5af1a84213..2573772e2fb3 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -132,6 +132,18 @@ struct opal_read_write_table { __u64 priv; }; +#define OPAL_FL_SUPPORTED 0x00000001 +#define OPAL_FL_LOCKING_SUPPORTED 0x00000002 +#define OPAL_FL_LOCKING_ENABLED 0x00000004 +#define OPAL_FL_LOCKED 0x00000008 +#define OPAL_FL_MBR_ENABLED 0x00000010 +#define OPAL_FL_MBR_DONE 0x00000020 + +struct opal_status { + __u32 flags; + __u32 reserved; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -148,5 +160,6 @@ struct opal_read_write_table { #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) #define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct 
opal_read_write_table) +#define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From e1d0c6d05afdcff01ace698edb3b8808db1dc066 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Tue, 23 Aug 2022 18:45:49 +0700 Subject: io_uring: uapi: Add `extern "C"` in io_uring.h for liburing Make it easy for liburing to integrate uapi header with the kernel. Previously, when this header changes, the liburing side can't directly copy this header file due to some small differences. Sync them. Link: https://lore.kernel.org/io-uring/f1feef16-6ea2-0653-238f-4aaee35060b6@kernel.dk Cc: Bart Van Assche Cc: Dylan Yudaken Cc: Facebook Kernel Team Signed-off-by: Ammar Faizi Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1463cfecb56b..9e0b5c8d92ce 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -12,6 +12,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + /* * IO submission data structure (Submission Queue Entry) */ @@ -661,4 +665,8 @@ struct io_uring_recvmsg_out { __u32 flags; }; +#ifdef __cplusplus +} +#endif + #endif -- cgit v1.2.3 From 12cda13cfd5310bbfefdfe32a82489228e2e0381 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 15 Aug 2022 15:43:25 -0400 Subject: fs: dlm: remove DLM_LSFL_FS from uapi The DLM_LSFL_FS flag is set in lockspaces created directly for a kernel user, as opposed to those lockspaces created for user space applications. The user space libdlm allowed this flag to be set for lockspaces created from user space, but then used by a kernel user. No kernel user has ever used this method, so remove the ability to do it. 
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- drivers/md/md-cluster.c | 4 ++-- fs/dlm/lockspace.c | 28 ++++++++++++++++++++++++---- fs/dlm/lockspace.h | 13 +++++++++++++ fs/dlm/user.c | 6 +++--- fs/gfs2/lock_dlm.c | 2 +- fs/ocfs2/stack_user.c | 2 +- include/linux/dlm.h | 3 --- include/uapi/linux/dlm.h | 1 - 8 files changed, 44 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 742b2349fea3..10e0c5381d01 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -876,8 +876,8 @@ static int join(struct mddev *mddev, int nodes) memset(str, 0, 64); sprintf(str, "%pU", mddev->uuid); ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, - DLM_LSFL_FS, LVB_SIZE, - &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); + 0, LVB_SIZE, &md_ls_ops, mddev, + &ops_rv, &cinfo->lockspace); if (ret) goto err; wait_for_completion(&cinfo->completion); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 41a6504cfab5..bae050df7abf 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -703,10 +703,11 @@ static int new_lockspace(const char *name, const char *cluster, return error; } -int dlm_new_lockspace(const char *name, const char *cluster, - uint32_t flags, int lvblen, - const struct dlm_lockspace_ops *ops, void *ops_arg, - int *ops_result, dlm_lockspace_t **lockspace) +static int __dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) { int error = 0; @@ -732,6 +733,25 @@ int dlm_new_lockspace(const char *name, const char *cluster, return error; } +int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags, + int lvblen, const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen, + ops, ops_arg, ops_result, lockspace); +} + +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags, lvblen, ops, + ops_arg, ops_result, lockspace); +} + static int lkb_idr_is_local(int id, void *p, void *data) { struct dlm_lkb *lkb = p; diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index 306fc4f4ea15..03f4a4a3a871 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -12,6 +12,14 @@ #ifndef __LOCKSPACE_DOT_H__ #define __LOCKSPACE_DOT_H__ +/* DLM_LSFL_FS + * The lockspace user is in the kernel (i.e. filesystem). Enables + * direct bast/cast callbacks. 
+ * + * internal lockspace flag - will be removed in future + */ +#define DLM_LSFL_FS 0x00000004 + int dlm_lockspace_init(void); void dlm_lockspace_exit(void); struct dlm_ls *dlm_find_lockspace_global(uint32_t id); @@ -20,6 +28,11 @@ struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); void dlm_stop_lockspaces(void); void dlm_stop_lockspaces_check(void); +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace); #endif /* __LOCKSPACE_DOT_H__ */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ca27f276a3f5..c5d27bccc3dc 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -423,9 +423,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags, - DLM_USER_LVB_LEN, NULL, NULL, NULL, - &lockspace); + error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name, + params->flags, DLM_USER_LVB_LEN, NULL, + NULL, NULL, &lockspace); if (error) return error; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 6ce369b096d4..71911bf9ab34 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) memcpy(cluster, table, strlen(table) - strlen(fsname)); fsname++; - flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + flags = DLM_LSFL_NEWEXCL; /* * create/join lockspace diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index a75e2b7d67f5..64e6ddcfe329 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) lc->oc_type = NO_CONTROLD; rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN, + DLM_LSFL_NEWEXCL, DLM_LVB_LEN, &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); if (rc) { if (rc == -EEXIST || rc == -EPROTO) diff --git a/include/linux/dlm.h b/include/linux/dlm.h index ff951e9f6f20..f5f55c2138ae 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -56,9 +56,6 @@ struct dlm_lockspace_ops { * DLM_LSFL_TIMEWARN * The dlm should emit netlink messages if locks have been waiting * for a configurable amount of time. (Unused.) - * DLM_LSFL_FS - * The lockspace user is in the kernel (i.e. filesystem). Enables - * direct bast/cast callbacks. * DLM_LSFL_NEWEXCL * dlm_new_lockspace() should return -EEXIST if the lockspace exists. * diff --git a/include/uapi/linux/dlm.h b/include/uapi/linux/dlm.h index 0d2eca287567..1923f4f3b05e 100644 --- a/include/uapi/linux/dlm.h +++ b/include/uapi/linux/dlm.h @@ -69,7 +69,6 @@ struct dlm_lksb { /* dlm_new_lockspace() flags */ #define DLM_LSFL_TIMEWARN 0x00000002 -#define DLM_LSFL_FS 0x00000004 #define DLM_LSFL_NEWEXCL 0x00000008 -- cgit v1.2.3 From 91350fe152930c0d61a362af68272526490efea5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:17 +0300 Subject: bpf, flow_dissector: Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode for bpf progs Currently, attaching BPF_PROG_TYPE_FLOW_DISSECTOR programs completely replaces the flow-dissector logic with custom dissection logic. This forces implementors to write programs that handle dissection for any flows expected in the namespace. It makes sense for flow-dissector BPF programs to just augment the dissector with custom logic (e.g. 
dissecting certain flows or custom protocols), while enjoying the broad capabilities of the standard dissector for any other traffic. Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode. Flow-dissector BPF programs may return this to indicate no dissection was made, and fallback to the standard dissector is requested. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-3-shmulik.ladkani@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/flow_dissector.c | 3 +++ tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 934a2a8beb87..7f87012b012e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5861,6 +5861,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a01817fb4ef4..990429c69ccd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1022,11 +1022,14 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d6085e15fc8..f38814fbb618 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5861,6 +5861,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { -- cgit v1.2.3 From 2172fb8007eaafbef18563afb6c1ae5a976bf787 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:54 -0700 Subject: bpf: update bpf_{g,s}et_retval documentation * replace 'syscall' with 'upper layers', still mention that it's being exported via syscall errno * describe what happens in set_retval(-EPERM) + return 1 * describe what happens with bind's 'return 3' Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 22 +++++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f87012b012e..644600dbb114 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5085,17 +5085,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. 
* - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f38814fbb618..4fb685591035 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5085,17 +5085,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From 30b6055428a90cc52d4add164df12b94ab07c3fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Aug 2022 13:02:20 -0700 Subject: net: improve and fix netlink kdoc Subsequent patch will render the kdoc from include/uapi/linux/netlink.h into Documentation. We need to fix the warnings. While at it move the comments on struct nlmsghdr to a proper kdoc comment. 
Link: https://lore.kernel.org/r/20220819200221.422801-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/netlink.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1e543cf0568c..e0ab261ceca2 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -41,12 +41,20 @@ struct sockaddr_nl { __u32 nl_groups; /* multicast groups mask */ }; +/** + * struct nlmsghdr - fixed format metadata header of Netlink messages + * @nlmsg_len: Length of message including header + * @nlmsg_type: Message content type + * @nlmsg_flags: Additional flags + * @nlmsg_seq: Sequence number + * @nlmsg_pid: Sending process port ID + */ struct nlmsghdr { - __u32 nlmsg_len; /* Length of message including header */ - __u16 nlmsg_type; /* Message content */ - __u16 nlmsg_flags; /* Additional flags */ - __u32 nlmsg_seq; /* Sequence number */ - __u32 nlmsg_pid; /* Sending process port ID */ + __u32 nlmsg_len; + __u16 nlmsg_type; + __u16 nlmsg_flags; + __u32 nlmsg_seq; + __u32 nlmsg_pid; }; /* Flags values */ @@ -337,6 +345,9 @@ enum netlink_attribute_type { * bitfield32 type (U32) * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + * + * @__NL_POLICY_TYPE_ATTR_MAX: number of attributes + * @NL_POLICY_TYPE_ATTR_MAX: highest attribute number */ enum netlink_policy_type_attr { NL_POLICY_TYPE_ATTR_UNSPEC, -- cgit v1.2.3 From e7a7b84e33178db4a839c5e1773247be17597c1f Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Sat, 30 Jul 2022 10:56:43 +0530 Subject: wifi: cfg80211: Add link_id parameter to various key operations for MLO Add support for various key operations on MLD by adding new parameter link_id. Pass the link_id received from userspace to driver for add_key, get_key, del_key, set_default_key, set_default_mgmt_key and set_default_beacon_key to support configuring keys specific to each MLO link. Userspace must not specify link ID for MLO pairwise key since it is common for all the MLO links. 
Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20220730052643.1959111-4-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 8 +- drivers/net/wireless/ath/wil6210/cfg80211.c | 10 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 22 ++-- drivers/net/wireless/marvell/libertas/cfg.c | 9 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 10 +- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 17 +-- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 12 +- drivers/net/wireless/rndis_wlan.c | 20 ++-- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 13 ++- drivers/staging/wlan-ng/cfg80211.c | 12 +- include/net/cfg80211.h | 37 ++++-- include/uapi/linux/nl80211.h | 14 ++- net/mac80211/cfg.c | 17 +-- net/wireless/ibss.c | 2 +- net/wireless/nl80211.c | 126 ++++++++++++++++----- net/wireless/rdev-ops.h | 58 +++++----- net/wireless/sme.c | 2 +- net/wireless/trace.h | 86 ++++++++------ net/wireless/util.c | 4 +- net/wireless/wext-compat.c | 11 +- 20 files changed, 312 insertions(+), 178 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index e11c7e9accc0..a20e0aeae284 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1124,7 +1124,7 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq, } static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { @@ -1249,7 +1249,7 @@ static int ath6kl_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, } static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { struct ath6kl *ar = ath6kl_priv(ndev); @@ -1279,7 +1279,7 @@ static int ath6kl_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, } static int ath6kl_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback) (void *cookie, struct key_params *)) @@ -1314,7 +1314,7 @@ static int ath6kl_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, } static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool unicast, bool multicast) { diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index f93bdffa4d1d..40f9a7ef8980 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1620,7 +1620,7 @@ static void wil_del_rx_key(u8 key_index, enum wmi_key_usage key_usage, } static int wil_cfg80211_add_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) @@ -1696,7 +1696,7 @@ static int wil_cfg80211_add_key(struct wiphy *wiphy, } static int wil_cfg80211_del_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { @@ -1723,7 +1723,7 @@ static int wil_cfg80211_del_key(struct wiphy *wiphy, /* Need to be present or wiphy_new() will WARN */ static int 
wil_cfg80211_set_default_key(struct wiphy *wiphy, - struct net_device *ndev, + struct net_device *ndev, int link_id, u8 key_index, bool unicast, bool multicast) { @@ -2072,8 +2072,8 @@ void wil_cfg80211_ap_recovery(struct wil6210_priv *wil) key_params.key = vif->gtk; key_params.key_len = vif->gtk_len; key_params.seq_len = IEEE80211_GCMP_PN_LEN; - rc = wil_cfg80211_add_key(wiphy, ndev, vif->gtk_index, false, - NULL, &key_params); + rc = wil_cfg80211_add_key(wiphy, ndev, -1, vif->gtk_index, + false, NULL, &key_params); if (rc) wil_err(wil, "vif %d recovery add key failed (%d)\n", i, rc); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 9d044af7991b..7c72ea26a7d7 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -2361,7 +2361,8 @@ done: static s32 brcmf_cfg80211_config_default_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_idx, bool unicast, bool multicast) + int link_id, u8 key_idx, bool unicast, + bool multicast) { struct brcmf_if *ifp = netdev_priv(ndev); struct brcmf_pub *drvr = ifp->drvr; @@ -2395,7 +2396,8 @@ done: static s32 brcmf_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_idx, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr) { struct brcmf_if *ifp = netdev_priv(ndev); struct brcmf_wsec_key *key; @@ -2432,8 +2434,8 @@ brcmf_cfg80211_del_key(struct wiphy *wiphy, struct net_device *ndev, static s32 brcmf_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_idx, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_if *ifp = netdev_priv(ndev); @@ -2457,8 +2459,8 @@ brcmf_cfg80211_add_key(struct wiphy *wiphy, struct net_device *ndev, } if (params->key_len == 0) - return brcmf_cfg80211_del_key(wiphy, ndev, key_idx, pairwise, - mac_addr); + return brcmf_cfg80211_del_key(wiphy, ndev, -1, key_idx, + pairwise, mac_addr); if (params->key_len > sizeof(key->data)) { bphy_err(drvr, "Too long key length (%u)\n", params->key_len); @@ -2553,8 +2555,9 @@ done: } static s32 -brcmf_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, u8 key_idx, - bool pairwise, const u8 *mac_addr, void *cookie, +brcmf_cfg80211_get_key(struct wiphy *wiphy, struct net_device *ndev, + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { @@ -2610,7 +2613,8 @@ done: static s32 brcmf_cfg80211_config_default_mgmt_key(struct wiphy *wiphy, - struct net_device *ndev, u8 key_idx) + struct net_device *ndev, int link_id, + u8 key_idx) { struct brcmf_if *ifp = netdev_priv(ndev); diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index b0b3f59dabc6..5e3ae00153b8 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -1435,7 +1435,7 @@ static int lbs_cfg_disconnect(struct wiphy *wiphy, struct net_device *dev, } static int lbs_cfg_set_default_key(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast) { @@ -1455,8 +1455,8 @@ static int lbs_cfg_set_default_key(struct wiphy *wiphy, static int 
lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 idx, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 idx, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct lbs_private *priv = wiphy_priv(wiphy); u16 key_info; @@ -1516,7 +1516,8 @@ static int lbs_cfg_add_key(struct wiphy *wiphy, struct net_device *netdev, static int lbs_cfg_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { lbs_deb_assoc("del_key: key_idx %d, mac_addr %pM\n", diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index d68c40e0e122..4f3bfdbe0d9d 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -154,7 +154,8 @@ static void *mwifiex_cfg80211_get_adapter(struct wiphy *wiphy) */ static int mwifiex_cfg80211_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); static const u8 bc_mac[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -443,7 +444,7 @@ mwifiex_cfg80211_set_power_mgmt(struct wiphy *wiphy, */ static int mwifiex_cfg80211_set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, + int link_id, u8 key_index, bool unicast, bool multicast) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); @@ -468,8 +469,8 @@ mwifiex_cfg80211_set_default_key(struct wiphy *wiphy, struct net_device *netdev, */ static int mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); struct mwifiex_wep_key *wep_key; @@ -506,6 +507,7 @@ mwifiex_cfg80211_add_key(struct wiphy *wiphy, struct net_device *netdev, static int mwifiex_cfg80211_set_default_mgmt_key(struct wiphy *wiphy, struct net_device *netdev, + int link_id, u8 key_index) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(netdev); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index 3ac373d29d93..f810a56a7ff0 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -540,8 +540,9 @@ static int wilc_wfi_cfg_copy_wpa_info(struct wilc_wfi_key *key_info, return 0; } -static int add_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, struct key_params *params) +static int add_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, + struct key_params *params) { int ret = 0, keylen = params->key_len; @@ -644,7 +645,7 @@ static int add_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, return ret; } -static int del_key(struct wiphy *wiphy, struct net_device *netdev, +static int del_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) @@ -685,8 +686,9 @@ static int del_key(struct wiphy *wiphy, struct net_device *netdev, return 0; } -static int get_key(struct wiphy *wiphy, 
struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, void *cookie, +static int get_key(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, + void *cookie, void (*callback)(void *cookie, struct key_params *)) { struct wilc_vif *vif = netdev_priv(netdev); @@ -723,13 +725,14 @@ static int get_key(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, /* wiphy_new_nm() will WARNON if not present */ static int set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { return 0; } static int set_default_mgmt_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index) + int link_id, u8 key_index) { struct wilc_vif *vif = netdev_priv(netdev); diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 83df5977400e..4d796f5a3221 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -532,8 +532,8 @@ qtnf_dump_station(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -548,7 +548,8 @@ static int qtnf_add_key(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -569,7 +570,8 @@ static int qtnf_del_key(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_set_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; @@ -585,7 +587,7 @@ static int qtnf_set_default_key(struct wiphy *wiphy, struct net_device *dev, static int qtnf_set_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index) + int link_id, u8 key_index) { struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 05524291d60c..325933217b41 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -489,14 +489,16 @@ static int rndis_join_ibss(struct wiphy *wiphy, struct net_device *dev, static int rndis_leave_ibss(struct wiphy *wiphy, struct net_device *dev); static int rndis_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params); static int rndis_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr); static int rndis_set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, bool multicast); + int link_id, u8 key_index, bool unicast, + bool multicast); static int rndis_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 
*mac, struct station_info *sinfo); @@ -2377,8 +2379,8 @@ static int rndis_leave_ibss(struct wiphy *wiphy, struct net_device *dev) } static int rndis_add_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct rndis_wlan_private *priv = wiphy_priv(wiphy); struct usbnet *usbdev = priv->usbdev; @@ -2413,7 +2415,8 @@ static int rndis_add_key(struct wiphy *wiphy, struct net_device *netdev, } static int rndis_del_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct rndis_wlan_private *priv = wiphy_priv(wiphy); struct usbnet *usbdev = priv->usbdev; @@ -2424,7 +2427,8 @@ static int rndis_del_key(struct wiphy *wiphy, struct net_device *netdev, } static int rndis_set_default_key(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct rndis_wlan_private *priv = wiphy_priv(wiphy); struct usbnet *usbdev = priv->usbdev; diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index cf35125b7891..1632a2654cc5 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -901,8 +901,8 @@ exit: } static int cfg80211_rtw_add_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { char *alg_name; u32 param_len; @@ -993,8 +993,8 @@ addkey_end: } static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { @@ -1002,7 +1002,8 @@ static int cfg80211_rtw_get_key(struct wiphy *wiphy, struct net_device *ndev, } static int cfg80211_rtw_del_key(struct wiphy *wiphy, struct net_device *ndev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct adapter *padapter = rtw_netdev_priv(ndev); struct security_priv *psecuritypriv = &padapter->securitypriv; @@ -1017,7 +1018,7 @@ static int cfg80211_rtw_del_key(struct wiphy *wiphy, struct net_device *ndev, } static int cfg80211_rtw_set_default_key(struct wiphy *wiphy, - struct net_device *ndev, u8 key_index + struct net_device *ndev, int link_id, u8 key_index , bool unicast, bool multicast ) { diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index b7b56d8406d1..471bb310176f 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -143,8 +143,8 @@ exit: } static int prism2_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct wlandevice *wlandev = dev->ml_priv; u32 did; @@ -172,7 +172,7 @@ static int prism2_add_key(struct wiphy *wiphy, struct net_device *dev, } static int prism2_get_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, + 
int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { @@ -202,7 +202,8 @@ static int prism2_get_key(struct wiphy *wiphy, struct net_device *dev, } static int prism2_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr) { struct wlandevice *wlandev = dev->ml_priv; u32 did; @@ -227,7 +228,8 @@ static int prism2_del_key(struct wiphy *wiphy, struct net_device *dev, } static int prism2_set_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index, bool unicast, bool multicast) + int link_id, u8 key_index, bool unicast, + bool multicast) { struct wlandevice *wlandev = dev->ml_priv; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1d393e09a632..ffbc65a9b00b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3931,22 +3931,33 @@ struct mgmt_frame_regs { * @del_intf_link: Remove an MLO link from the given interface. * * @add_key: add a key with the given parameters. @mac_addr will be %NULL - * when adding a group key. + * when adding a group key. @link_id will be -1 for non-MLO connection. + * For MLO connection, @link_id will be >= 0 for group key and -1 for + * pairwise key, @mac_addr will be peer's MLD address for MLO pairwise key. * * @get_key: get information about the key with the given parameters. * @mac_addr will be %NULL when requesting information for a group * key. All pointers given to the @callback function need not be valid * after it returns. This function should return an error if it is * not possible to retrieve the key, -ENOENT if it doesn't exist. + * @link_id will be -1 for non-MLO connection. For MLO connection, + * @link_id will be >= 0 for group key and -1 for pairwise key, @mac_addr + * will be peer's MLD address for MLO pairwise key. * * @del_key: remove a key given the @mac_addr (%NULL for a group key) - * and @key_index, return -ENOENT if the key doesn't exist. + * and @key_index, return -ENOENT if the key doesn't exist. @link_id will + * be -1 for non-MLO connection. For MLO connection, @link_id will be >= 0 + * for group key and -1 for pairwise key, @mac_addr will be peer's MLD + * address for MLO pairwise key. * - * @set_default_key: set the default key on an interface + * @set_default_key: set the default key on an interface. @link_id will be >= 0 + * for MLO connection and -1 for non-MLO connection. * - * @set_default_mgmt_key: set the default management frame key on an interface + * @set_default_mgmt_key: set the default management frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * - * @set_default_beacon_key: set the default Beacon frame key on an interface + * @set_default_beacon_key: set the default Beacon frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. 
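
[Illustration, not part of the patch: a minimal sketch of how a driver might honor the new link_id argument per the kerneldoc above. The drv_add_key, drv_install_key and drv_install_link_key names are hypothetical; only the callback signature comes from the patch.]

	#include <net/cfg80211.h>

	/* Hypothetical driver callback showing how link_id is interpreted:
	 * -1 means a non-MLO connection, or an MLO pairwise key (in which
	 * case mac_addr is the peer's MLD address); >= 0 names the link
	 * that owns an MLO group key.
	 */
	static int drv_add_key(struct wiphy *wiphy, struct net_device *netdev,
			       int link_id, u8 key_index, bool pairwise,
			       const u8 *mac_addr, struct key_params *params)
	{
		if (link_id < 0)	/* non-MLO, or MLO pairwise key */
			return drv_install_key(netdev, key_index, pairwise,
					       mac_addr, params);

		/* MLO group key: program it on the indicated link only */
		return drv_install_link_key(netdev, link_id, key_index,
					    mac_addr, params);
	}
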
* * @set_rekey_data: give the data necessary for GTK rekeying to the driver * @@ -4295,22 +4306,24 @@ struct cfg80211_ops { unsigned int link_id); int (*add_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params); int (*get_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)); int (*del_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr); int (*set_default_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast); int (*set_default_mgmt_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index); int (*set_default_beacon_key)(struct wiphy *wiphy, struct net_device *netdev, + int link_id, u8 key_index); int (*start_ap)(struct wiphy *wiphy, struct net_device *dev, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ffb7c573e299..573db20403dc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -377,14 +377,22 @@ * the non-transmitting interfaces are deleted as well. * * @NL80211_CMD_GET_KEY: Get sequence counter information for a key specified - * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. + * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC + * represents peer's MLD address for MLO pairwise key. For MLO group key, + * the link is identified by %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT, * %NL80211_ATTR_KEY_DEFAULT_MGMT, or %NL80211_ATTR_KEY_THRESHOLD. + * For MLO connection, the link to set default key is identified by + * %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_NEW_KEY: add a key with given %NL80211_ATTR_KEY_DATA, * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC, %NL80211_ATTR_KEY_CIPHER, - * and %NL80211_ATTR_KEY_SEQ attributes. + * and %NL80211_ATTR_KEY_SEQ attributes. %NL80211_ATTR_MAC represents + * peer's MLD address for MLO pairwise key. The link to add MLO + * group key is identified by %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_DEL_KEY: delete a key identified by %NL80211_ATTR_KEY_IDX - * or %NL80211_ATTR_MAC. + * or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC represents peer's MLD address + * for MLO pairwise key. The link to delete group key is identified by + * %NL80211_ATTR_MLO_LINK_ID. 
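
[For quick reference, the link ID rules these commands document boil down to the following standalone sketch. This is illustrative only; the in-kernel enforcement is the nl80211_validate_key_link_id() helper added below, which additionally checks that the given link ID is actually set in wdev->valid_links.]

	#include <stdbool.h>

	/* Truth table for the key/link-ID rules described above; "mlo"
	 * stands in for an MLO connection (wdev->valid_links != 0).
	 */
	static bool key_link_id_ok(bool pairwise, bool mlo, int link_id)
	{
		if (pairwise)		/* pairwise keys never carry a link ID */
			return link_id == -1;
		if (mlo)		/* MLO group keys must name a link */
			return link_id >= 0;
		return link_id == -1;	/* non-MLO group keys carry no link ID */
	}
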
* * @NL80211_CMD_GET_BEACON: (not used) * @NL80211_CMD_SET_BEACON: change the beacon on an access point interface diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d97e13b5c3a8..c4c5e2d44eb8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -452,8 +452,8 @@ static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, } static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool pairwise, const u8 *mac_addr, - struct key_params *params) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, struct key_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -596,7 +596,8 @@ ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, } static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool pairwise, const u8 *mac_addr) + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -623,8 +624,8 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_idx, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params *params)) { @@ -729,7 +730,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, static int ieee80211_config_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, bool uni, + int link_id, u8 key_idx, bool uni, bool multi) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -741,7 +742,7 @@ static int ieee80211_config_default_key(struct wiphy *wiphy, static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx) + int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -752,7 +753,7 @@ static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, static int ieee80211_config_default_beacon_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx) + int link_id, u8 key_idx) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 4935f94d1acc..edd062f104f4 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -171,7 +171,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) - rdev_del_key(rdev, dev, i, false, NULL); + rdev_del_key(rdev, dev, -1, i, false, NULL); if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e2169c364ae1..72242681ab86 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1545,7 +1545,6 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) return -ENOLINK; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - /* for MLO, require driver validation of the link ID */ if (wdev->connected) return 0; return -ENOLINK; @@ -4333,6 +4332,38 @@ static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info) return rdev_set_noack_map(rdev, dev, noack_map); } +static int nl80211_validate_key_link_id(struct genl_info *info, + struct wireless_dev *wdev, + int link_id, bool pairwise) +{ + if (pairwise) { + if (link_id != -1) { 
+ GENL_SET_ERR_MSG(info, + "link ID not allowed for pairwise key"); + return -EINVAL; + } + + return 0; + } + + if (wdev->valid_links) { + if (link_id == -1) { + GENL_SET_ERR_MSG(info, + "link ID must for MLO group key"); + return -EINVAL; + } + if (!(wdev->valid_links & BIT(link_id))) { + GENL_SET_ERR_MSG(info, "invalid link ID for MLO group key"); + return -EINVAL; + } + } else if (link_id != -1) { + GENL_SET_ERR_MSG(info, "link ID not allowed for non-MLO group key"); + return -EINVAL; + } + + return 0; +} + struct get_key_cookie { struct sk_buff *msg; int error; @@ -4394,13 +4425,15 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) void *hdr; struct sk_buff *msg; bool bigtk_support = false; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION)) bigtk_support = true; - if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || - dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) bigtk_support = true; @@ -4452,8 +4485,12 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr)) goto nla_put_failure; - err = rdev_get_key(rdev, dev, key_idx, pairwise, mac_addr, &cookie, - get_key_callback); + err = nl80211_validate_key_link_id(info, wdev, link_id, pairwise); + if (err) + goto free_msg; + + err = rdev_get_key(rdev, dev, link_id, key_idx, pairwise, mac_addr, + &cookie, get_key_callback); if (err) goto free_msg; @@ -4477,6 +4514,8 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) struct key_parse key; int err; struct net_device *dev = info->user_ptr[1]; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -4492,7 +4531,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) !(key.p.mode == NL80211_KEY_SET_TX)) return -EINVAL; - wdev_lock(dev->ieee80211_ptr); + wdev_lock(wdev); if (key.def) { if (!rdev->ops->set_default_key) { @@ -4500,18 +4539,22 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = nl80211_key_allowed(dev->ieee80211_ptr); + err = nl80211_key_allowed(wdev); + if (err) + goto out; + + err = nl80211_validate_key_link_id(info, wdev, link_id, false); if (err) goto out; - err = rdev_set_default_key(rdev, dev, key.idx, - key.def_uni, key.def_multi); + err = rdev_set_default_key(rdev, dev, link_id, key.idx, + key.def_uni, key.def_multi); if (err) goto out; #ifdef CONFIG_CFG80211_WEXT - dev->ieee80211_ptr->wext.default_key = key.idx; + wdev->wext.default_key = key.idx; #endif } else if (key.defmgmt) { if (key.def_uni || !key.def_multi) { @@ -4524,16 +4567,20 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = nl80211_key_allowed(dev->ieee80211_ptr); + err = nl80211_key_allowed(wdev); + if (err) + goto out; + + err = nl80211_validate_key_link_id(info, wdev, link_id, false); if (err) goto out; - err = rdev_set_default_mgmt_key(rdev, dev, key.idx); + err = rdev_set_default_mgmt_key(rdev, dev, link_id, key.idx); if (err) goto out; #ifdef CONFIG_CFG80211_WEXT - dev->ieee80211_ptr->wext.default_mgmt_key = key.idx; + wdev->wext.default_mgmt_key = 
key.idx; #endif } else if (key.defbeacon) { if (key.def_uni || !key.def_multi) { @@ -4546,11 +4593,15 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = nl80211_key_allowed(dev->ieee80211_ptr); + err = nl80211_key_allowed(wdev); + if (err) + goto out; + + err = nl80211_validate_key_link_id(info, wdev, link_id, false); if (err) goto out; - err = rdev_set_default_beacon_key(rdev, dev, key.idx); + err = rdev_set_default_beacon_key(rdev, dev, link_id, key.idx); if (err) goto out; } else if (key.p.mode == NL80211_KEY_SET_TX && @@ -4566,14 +4617,18 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) goto out; } - err = rdev_add_key(rdev, dev, key.idx, + err = nl80211_validate_key_link_id(info, wdev, link_id, true); + if (err) + goto out; + + err = rdev_add_key(rdev, dev, link_id, key.idx, NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); } else { err = -EINVAL; } out: - wdev_unlock(dev->ieee80211_ptr); + wdev_unlock(wdev); return err; } @@ -4585,6 +4640,8 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct key_parse key; const u8 *mac_addr = NULL; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -4626,18 +4683,23 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - wdev_lock(dev->ieee80211_ptr); - err = nl80211_key_allowed(dev->ieee80211_ptr); + wdev_lock(wdev); + err = nl80211_key_allowed(wdev); if (err) GENL_SET_ERR_MSG(info, "key not allowed"); + + if (!err) + err = nl80211_validate_key_link_id(info, wdev, link_id, + key.type == NL80211_KEYTYPE_PAIRWISE); + if (!err) { - err = rdev_add_key(rdev, dev, key.idx, + err = rdev_add_key(rdev, dev, link_id, key.idx, key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); if (err) GENL_SET_ERR_MSG(info, "key addition failed"); } - wdev_unlock(dev->ieee80211_ptr); + wdev_unlock(wdev); return err; } @@ -4649,6 +4711,8 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; u8 *mac_addr = NULL; struct key_parse key; + int link_id = nl80211_link_id_or_invalid(info->attrs); + struct wireless_dev *wdev = dev->ieee80211_ptr; err = nl80211_parse_key(info, &key); if (err) @@ -4676,27 +4740,31 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_key) return -EOPNOTSUPP; - wdev_lock(dev->ieee80211_ptr); - err = nl80211_key_allowed(dev->ieee80211_ptr); + wdev_lock(wdev); + err = nl80211_key_allowed(wdev); if (key.type == NL80211_KEYTYPE_GROUP && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; if (!err) - err = rdev_del_key(rdev, dev, key.idx, + err = nl80211_validate_key_link_id(info, wdev, link_id, + key.type == NL80211_KEYTYPE_PAIRWISE); + + if (!err) + err = rdev_del_key(rdev, dev, link_id, key.idx, key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr); #ifdef CONFIG_CFG80211_WEXT if (!err) { - if (key.idx == dev->ieee80211_ptr->wext.default_key) - dev->ieee80211_ptr->wext.default_key = -1; - else if (key.idx == dev->ieee80211_ptr->wext.default_mgmt_key) - dev->ieee80211_ptr->wext.default_mgmt_key = -1; + if (key.idx == wdev->wext.default_key) + wdev->wext.default_key = -1; + else if (key.idx == wdev->wext.default_mgmt_key) + wdev->wext.default_mgmt_key = -1; } #endif - wdev_unlock(dev->ieee80211_ptr); + wdev_unlock(wdev); return err; } diff --git 
a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 40915a82da73..13b209a8db28 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -77,65 +77,69 @@ rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, + struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; - trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, + trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params->mode); - ret = rdev->ops->add_key(&rdev->wiphy, netdev, key_index, pairwise, - mac_addr, params); + ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, + pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, + int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, + void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; - trace_rdev_get_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); - ret = rdev->ops->get_key(&rdev->wiphy, netdev, key_index, pairwise, - mac_addr, cookie, callback); + trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + mac_addr); + ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, + pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr) + struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; - trace_rdev_del_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); - ret = rdev->ops->del_key(&rdev->wiphy, netdev, key_index, pairwise, - mac_addr); + trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, + mac_addr); + ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, + pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index, bool unicast, - bool multicast) + struct net_device *netdev, int link_id, u8 key_index, + bool unicast, bool multicast) { int ret; - trace_rdev_set_default_key(&rdev->wiphy, netdev, key_index, + trace_rdev_set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); - ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, key_index, - unicast, multicast); + ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, link_id, + key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index) + struct net_device *netdev, int link_id, u8 key_index) { int ret; - trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, key_index); - ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, + trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, + key_index); + ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -143,13 +147,15 @@ 
rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u8 key_index) + struct net_device *netdev, int link_id, + u8 key_index) { int ret; - trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, key_index); - ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, - key_index); + trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, + key_index); + ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, + key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 34d27a3070f0..0a5c95631f78 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1326,7 +1326,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) - rdev_del_key(rdev, dev, i, false, NULL); + rdev_del_key(rdev, dev, -1, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 10b2fd9bacb5..001c00d9c5e7 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -434,13 +434,14 @@ TRACE_EVENT(rdev_change_virtual_intf, ); DECLARE_EVENT_CLASS(key_handle, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) + __field(int, link_id) __field(u8, key_index) __field(bool, pairwise) ), @@ -448,34 +449,38 @@ DECLARE_EVENT_CLASS(key_handle, WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); + __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, - BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + __entry->key_index, BOOL_TO_STR(__entry->pairwise), + MAC_PR_ARG(mac_addr)) ); DEFINE_EVENT(key_handle, rdev_get_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) ); DEFINE_EVENT(key_handle, rdev_del_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) ); TRACE_EVENT(rdev_add_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr, u8 mode), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr, mode), + TP_PROTO(struct wiphy *wiphy, struct 
net_device *netdev, int link_id, + u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode), + TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr, mode), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) + __field(int, link_id) __field(u8, key_index) __field(bool, pairwise) __field(u8, mode) @@ -484,24 +489,27 @@ TRACE_EVENT(rdev_add_key, WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); + __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; __entry->mode = mode; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, " - "mode: %u, pairwise: %s, mac addr: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, - __entry->mode, BOOL_TO_STR(__entry->pairwise), - MAC_PR_ARG(mac_addr)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key_index: %u, mode: %u, pairwise: %s, " + "mac addr: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + __entry->key_index, __entry->mode, + BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) ); TRACE_EVENT(rdev_set_default_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool unicast, bool multicast), - TP_ARGS(wiphy, netdev, key_index, unicast, multicast), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index, bool unicast, bool multicast), + TP_ARGS(wiphy, netdev, link_id, key_index, unicast, multicast), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __field(u8, key_index) __field(bool, unicast) __field(bool, multicast) @@ -509,48 +517,58 @@ TRACE_EVENT(rdev_set_default_key, TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; __entry->key_index = key_index; __entry->unicast = unicast; __entry->multicast = multicast; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u, unicast: %s, multicast: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, - BOOL_TO_STR(__entry->unicast), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key index: %u, unicast: %s, multicast: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, + __entry->key_index, BOOL_TO_STR(__entry->unicast), BOOL_TO_STR(__entry->multicast)) ); TRACE_EVENT(rdev_set_default_mgmt_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), - TP_ARGS(wiphy, netdev, key_index), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index), + TP_ARGS(wiphy, netdev, link_id, key_index), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; __entry->key_index = key_index; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->link_id, __entry->key_index) ); TRACE_EVENT(rdev_set_default_beacon_key, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), - TP_ARGS(wiphy, netdev, key_index), + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, + u8 key_index), + TP_ARGS(wiphy, netdev, link_id, key_index), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(int, link_id) __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->link_id = link_id; __entry->key_index = key_index; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", - WIPHY_PR_ARG, 
NETDEV_PR_ARG, __entry->key_index) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " + "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->link_id, __entry->key_index) ); TRACE_EVENT(rdev_start_ap, diff --git a/net/wireless/util.c b/net/wireless/util.c index 2c127951764a..0b28d00ba8f5 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -935,13 +935,13 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) { if (!wdev->connect_keys->params[i].cipher) continue; - if (rdev_add_key(rdev, dev, i, false, NULL, + if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && - rdev_set_default_key(rdev, dev, i, true, true)) { + rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 129d3bb91dfb..ddf340bfa07a 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -470,7 +470,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; else - err = rdev_del_key(rdev, dev, idx, pairwise, + err = rdev_del_key(rdev, dev, -1, idx, pairwise, addr); } wdev->wext.connect.privacy = false; @@ -509,7 +509,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) - err = rdev_add_key(rdev, dev, idx, pairwise, addr, params); + err = rdev_add_key(rdev, dev, -1, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; @@ -546,7 +546,8 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, __cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } - err = rdev_set_default_key(rdev, dev, idx, true, true); + err = rdev_set_default_key(rdev, dev, -1, idx, true, + true); } if (!err) { wdev->wext.default_key = idx; @@ -561,7 +562,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) - err = rdev_set_default_mgmt_key(rdev, dev, idx); + err = rdev_set_default_mgmt_key(rdev, dev, -1, idx); if (!err) wdev->wext.default_mgmt_key = idx; return err; @@ -632,7 +633,7 @@ static int cfg80211_wext_siwencode(struct net_device *dev, if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) - err = rdev_set_default_key(rdev, dev, idx, true, + err = rdev_set_default_key(rdev, dev, -1, idx, true, true); if (!err) wdev->wext.default_key = idx; -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - process only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. 
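
[As an illustration (not part of this patch), a userspace attach flow might look like the sketch below, mirroring the selftests further down in this series. It assumes libbpf and an already-loaded SEC("iter/cgroup") program; error handling is elided.]

	#include <stdio.h>
	#include <unistd.h>
	#include <bpf/bpf.h>
	#include <bpf/libbpf.h>

	/* Attach a cgroup iterator to one cgroup fd, then dump the iterator's
	 * seq_file output to stdout.
	 */
	static void dump_cgroup_iter(struct bpf_program *prog, int cgroup_fd)
	{
		DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
		union bpf_iter_link_info linfo = {};
		struct bpf_link *link;
		char buf[4096];
		ssize_t len;
		int iter_fd;

		linfo.cgroup.cgroup_fd = cgroup_fd;
		/* renamed to BPF_CGROUP_ITER_DESCENDANTS_PRE by a later
		 * commit below
		 */
		linfo.cgroup.order = BPF_ITER_DESCENDANTS_PRE;
		opts.link_info = &linfo;
		opts.link_info_len = sizeof(linfo);

		link = bpf_program__attach_iter(prog, &opts);
		iter_fd = bpf_iter_create(bpf_link__fd(link));

		while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, len, stdout);

		close(iter_fd);
		bpf_link__destroy(link);
	}
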
For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking the cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data the BPF program intends to send to user space, the number of cgroups that can be walked is limited. For example, given the current buffer size is 8 * PAGE_SIZE, if the program sends 64B of data for each cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. To work around this, the user may have to update their program to reduce the volume of data sent to output, for example by skipping some uninteresting cgroups. In the future, we may extend bpf_iter flags to allow customizing the buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 + include/uapi/linux/bpf.h | 30 +++ kernel/bpf/Makefile | 3 + kernel/bpf/cgroup_iter.c | 284 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 30 +++ tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 +- 6 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/cgroup_iter.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 644600dbb114..0f61f09f467a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details.
*/ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..00e05b69a4df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..cf6d763a57d5 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/cgroup.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. + * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. + * + * Currently only one session is supported, which means, depending on the + * volume of data the BPF program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B of data for each + * cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. To work around this, the user may have to update their + * program to reduce the volume of data sent to output, for example by + * skipping some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions.
*/ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. + */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_ITER_ANCESTORS_UP) + return p->start_css; + else /* BPF_ITER_SELF_ONLY */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + ctx.cgroup = css ? css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. 
*/ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + p->start_css = &cgrp->self; + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_ITER_DESCENDANTS_PRE && + order != BPF_ITER_DESCENDANTS_POST && + order != BPF_ITER_ANCESTORS_UP && + order != BPF_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. */ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. 
+ */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4fb685591035..5056cef2112f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. 
+ */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 5fce7008d1ff..84c1cfaa2b02 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,8 +764,8 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}", - { .map = { .map_fd = 1 }}); + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (__u32)1,.cgroup_fd = (__u32)1,},}", + { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so * complex, we don't do a string comparison, just verify we return -- cgit v1.2.3 From d4ffb6f39f1a1b260966b43a4ffdb64779c650dd Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 25 Aug 2022 15:39:36 -0700 Subject: bpf: Add CGROUP prefix to cgroup_iter_order bpf_cgroup_iter_order is globally visible but the entries do not have CGROUP prefix. As requested by Andrii, put a CGROUP in the names in bpf_cgroup_iter_order. This patch fixes two previous commits: one introduced the API and the other uses the API in bpf selftest (that is, the selftest cgroup_hierarchical_stats). I tested this patch via the following command: test_progs -t cgroup,iter,btf_dump Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Fixes: 88886309d2e8 ("selftests/bpf: add a selftest for cgroup hierarchical stats collection") Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220825223936.1865810-1-haoluo@google.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 10 +++---- kernel/bpf/cgroup_iter.c | 32 +++++++++++----------- tools/include/uapi/linux/bpf.h | 10 +++---- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- .../bpf/prog_tests/cgroup_hierarchical_stats.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_iter.c | 10 +++---- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f61f09f467a..bdf4bc6d8d6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index cf6d763a57d5..c69bce2f4403 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -74,13 +74,13 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) ++*pos; p->terminate = false; p->visited_all = false; - if (p->order == BPF_ITER_DESCENDANTS_PRE) + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) return css_next_descendant_pre(NULL, p->start_css); - else if (p->order == BPF_ITER_DESCENDANTS_POST) + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); - else if (p->order == BPF_ITER_ANCESTORS_UP) + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return p->start_css; - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ return p->start_css; } @@ -109,13 +109,13 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (p->terminate) return NULL; - if (p->order == BPF_ITER_DESCENDANTS_PRE) + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) return css_next_descendant_pre(curr, p->start_css); - else if (p->order == BPF_ITER_DESCENDANTS_POST) + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(curr, p->start_css); - else if (p->order == BPF_ITER_ANCESTORS_UP) + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -188,10 +188,10 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_ITER_DESCENDANTS_PRE && - order != BPF_ITER_DESCENDANTS_POST && - order != BPF_ITER_ANCESTORS_UP && - order != BPF_ITER_SELF_ONLY) + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) return -EINVAL; if (fd && id) @@ -239,13 +239,13 @@ static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, kfree(buf); show_order: - if (aux->cgroup.order == BPF_ITER_DESCENDANTS_PRE) + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) seq_puts(seq, "order: descendants_pre\n"); - else if (aux->cgroup.order == BPF_ITER_DESCENDANTS_POST) + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) seq_puts(seq, "order: descendants_post\n"); - else if (aux->cgroup.order == BPF_ITER_ANCESTORS_UP) + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); - else /* BPF_ITER_SELF_ONLY */ + else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5056cef2112f..92f7387e378a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. 
*/ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ }; union bpf_iter_link_info { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index a1bae92be1fc..7b5bbe21b549 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* union with nested struct */ TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT, - "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}", { .cgroup = { .order = 1, .cgroup_fd = 1, }}); /* struct skb with nested structs/unions; because type output is so diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c index 101a6d70b863..bed1661596f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c @@ -275,7 +275,7 @@ static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj, * traverse one cgroup, so set the traversal order to "self". */ linfo.cgroup.cgroup_fd = cgroup_fd; - linfo.cgroup.order = BPF_ITER_SELF_ONLY; + linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY; opts.link_info = &linfo; opts.link_info_len = sizeof(linfo); link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 38958c37b9ce..c4a2adb38da1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -134,7 +134,7 @@ static void test_walk_preorder(struct cgroup_iter *skel) cg_id[PARENT], cg_id[CHILD1], cg_id[CHILD2]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_PRE, "preorder"); + BPF_CGROUP_ITER_DESCENDANTS_PRE, "preorder"); } /* Postorder walk prints child and parent in order. */ @@ -145,7 +145,7 @@ static void test_walk_postorder(struct cgroup_iter *skel) cg_id[CHILD1], cg_id[CHILD2], cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_POST, "postorder"); + BPF_CGROUP_ITER_DESCENDANTS_POST, "postorder"); } /* Walking parents prints parent and then root. 
*/ @@ -159,7 +159,7 @@ static void test_walk_ancestors_up(struct cgroup_iter *skel) cg_id[PARENT], cg_id[ROOT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_ANCESTORS_UP, "ancestors_up"); + BPF_CGROUP_ITER_ANCESTORS_UP, "ancestors_up"); skel->bss->terminal_cgroup = 0; } @@ -174,7 +174,7 @@ static void test_early_termination(struct cgroup_iter *skel) PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_DESCENDANTS_PRE, "early_termination"); + BPF_CGROUP_ITER_DESCENDANTS_PRE, "early_termination"); skel->bss->terminate_early = 0; } @@ -186,7 +186,7 @@ static void test_walk_self_only(struct cgroup_iter *skel) PROLOGUE "%8llu\n" EPILOGUE, cg_id[PARENT]); read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], - BPF_ITER_SELF_ONLY, "self_only"); + BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } void test_cgroup_iter(void) -- cgit v1.2.3 From 93315e46b000fc80fff5d53c3f444417fb3df6de Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:00 +0530 Subject: perf/core: Add speculation info to branch entries Add a new "spec" bitfield to branch entries for providing speculation information. This will be populated using hints provided by branch sampling features on supported hardware. The following cases are covered: * No branch speculation information is available * Branch is speculative but taken on the wrong path * Branch is non-speculative but taken on the correct path * Branch is speculative and taken on the correct path Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/834088c302faf21c7b665031dd111f424e509a64.1660211399.git.sandipan.das@amd.com --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee8b9ecdc03b..ae30c61957d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1078,6 +1078,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 03b370062741..30a4723aefd4 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -256,6 +256,17 @@ enum { PERF_BR_MAX, }; +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1363,6 +1374,7 @@ union perf_mem_data_src { * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) * type: branch type + * spec: branch speculation info (or 0 if not supported) */ struct perf_branch_entry { __u64 from; @@ -1373,7 +1385,8 @@ struct perf_branch_entry { abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ type:4, /* branch type */ - reserved:40; + spec:2, /* branch speculation info */ + reserved:38; }; union perf_sample_weight { -- cgit v1.2.3 From 
54c4ef34c4b6f9720fded620e2893894f9f2c554 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 25 Aug 2022 05:04:49 +0300 Subject: openvswitch: allow specifying ifindex of new interfaces CRIU is preserving ifindexes of net devices after restoration. However, current Open vSwitch API does not allow to target ifindex, so we cannot correctly restore OVS configuration. Add new OVS_DP_ATTR_IFINDEX for OVS_DP_CMD_NEW and use it as desired ifindex. Use OVS_VPORT_ATTR_IFINDEX during OVS_VPORT_CMD_NEW to specify new netdev ifindex. Signed-off-by: Andrey Zhadchenko Acked-by: Christian Brauner (Microsoft) Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 3 +++ net/openvswitch/datapath.c | 11 +++++++++-- net/openvswitch/vport-internal_dev.c | 1 + net/openvswitch/vport.h | 2 ++ 4 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ce3e1738d427..94066f87e9ee 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -76,6 +76,8 @@ enum ovs_datapath_cmd { * datapath. Always present in notifications. * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the * datapath. Always present in notifications. + * @OVS_DP_ATTR_IFINDEX: Interface index for a new datapath netdev. Only + * valid for %OVS_DP_CMD_NEW requests. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_DP_* commands. @@ -92,6 +94,7 @@ enum ovs_datapath_attr { OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in * per-cpu dispatch mode */ + OVS_DP_ATTR_IFINDEX, __OVS_DP_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 45f9a7b3410e..1ad771d39d1e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1787,6 +1787,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; + parms.desired_ifindex = a[OVS_DP_ATTR_IFINDEX] + ? nla_get_u32(a[OVS_DP_ATTR_IFINDEX]) : 0; /* So far only local changes have been made, now need the lock. */ ovs_lock(); @@ -2004,6 +2006,7 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), + [OVS_DP_ATTR_IFINDEX] = {.type = NLA_U32 }, }; static const struct genl_small_ops dp_datapath_genl_ops[] = { @@ -2207,7 +2210,10 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; - if (a[OVS_VPORT_ATTR_IFINDEX]) + + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + + if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; port_no = a[OVS_VPORT_ATTR_PORT_NO] @@ -2244,11 +2250,12 @@ restart: } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); - parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; + parms.desired_ifindex = a[OVS_VPORT_ATTR_IFINDEX] + ? 
nla_get_u32(a[OVS_VPORT_ATTR_IFINDEX]) : 0; vport = new_vport(&parms); err = PTR_ERR(vport); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 134bc8503461..35f42c9821c2 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -147,6 +147,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) } dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + dev->ifindex = parms->desired_ifindex; internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 9de5030d9801..7d276f60c000 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -90,12 +90,14 @@ struct vport { * @type: New vport's type. * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if * none was supplied. + * @desired_ifindex: New vport's ifindex. * @dp: New vport's datapath. * @port_no: New vport's port number. */ struct vport_parms { const char *name; enum ovs_vport_type type; + int desired_ifindex; struct nlattr *options; /* For ovs_vport_alloc(). */ -- cgit v1.2.3 From aa75622c3be4d5819ce69c714acbcbd67bba5d65 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 25 Aug 2022 23:08:06 +0100 Subject: bpf: Fix a few typos in BPF helpers documentation Address a few typos in the documentation for the BPF helper functions. They were reported by Jakub [0], who ran spell checkers on the generated man page [1]. [0] https://lore.kernel.org/linux-man/d22dcd47-023c-8f52-d369-7b5308e6c842@gmail.com/T/#mb02e7d4b7fb61d98fa914c77b581184e9a9537af [1] https://lore.kernel.org/linux-man/eb6a1e41-c48e-ac45-5154-ac57a2c76108@gmail.com/T/#m4a8d1b003616928013ffcd1450437309ab652f9f v3: Do not copy unrelated (and breaking) elements to tools/ header v2: Turn a ',' into a ';' Reported-by: Jakub Wilk Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220825220806.107143-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 16 ++++++++-------- tools/include/uapi/linux/bpf.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bdf4bc6d8d6b..962960a98835 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4456,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4665,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4723,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). 
* If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4747,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 92f7387e378a..f4ba82a1eace 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4456,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4665,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4723,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4747,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. 
This is practical if this isn't -- cgit v1.2.3 From a724ec82966d57e4b5d36341d3e3dc1a3c011564 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:15 +0530 Subject: perf: Add system error and not in transaction branch types This expands the generic branch type classification by adding two more entries, i.e. system error and not in transaction. This also updates the x86 implementation to process X86_BR_NO_TX records as appropriate. This changes the branch types reported to user space on the x86 platform, but it should not be a problem. The possible scenarios and impacts are enumerated here. -------------------------------------------------------------------------- | kernel | perf tool | Impact | -------------------------------------------------------------------------- | old | old | Works as before | -------------------------------------------------------------------------- | old | new | PERF_BR_UNKNOWN is processed | -------------------------------------------------------------------------- | new | old | PERF_BR_NO_TX is blocked via old PERF_BR_MAX | -------------------------------------------------------------------------- | new | new | PERF_BR_NO_TX is recognized | -------------------------------------------------------------------------- When PERF_BR_NO_TX is blocked via the old PERF_BR_MAX (new kernel with old perf tool), user space might throw up a warning complaining about unrecognized branch types being reported, but that is expected. The PERF_BR_SERROR & PERF_BR_NO_TX branch types will be used for the BRBE implementation on the arm64 platform. PERF_BR_NO_TX complements the 'abort' and 'in_tx' elements in perf_branch_entry, which represent other transaction states for a given branch record, thus completing the transaction state classification. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-2-anshuman.khandual@arm.com --- arch/x86/events/utils.c | 2 +- include/uapi/linux/perf_event.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index e013243f360c..5f5617afde79 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -225,7 +225,7 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = { PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ PERF_BR_UNKNOWN, /* X86_BR_ABORT */ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_NO_TX, /* X86_BR_NO_TX */ PERF_BR_CALL, /* X86_BR_ZERO_CALL */ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ PERF_BR_IND, /* X86_BR_IND_JMP */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 30a4723aefd4..a79cc0eb4de7 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -253,6 +253,8 @@ enum { PERF_BR_COND_RET = 10, /* conditional function return */ PERF_BR_ERET = 11, /* exception return */ PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ PERF_BR_MAX, }; -- cgit v1.2.3 From b190bc4ac9e6d9763b61654c5a0c085ff77d7a09 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:16 +0530 Subject: perf: Extend branch type classification branch_entry.type has now run out of space to accommodate more branch type classifications. This will prevent the perf branch stack implementation on arm64 (via BRBE) from capturing all available branch types.
Extending this bit field, i.e. branch_entry.type [4 bits], is not an option, as it would break the user-space ABI for both little- and big-endian perf tools. Extend branch classification with a new field, branch_entry.new_type, via a new branch type PERF_BR_EXTEND_ABI in branch_entry.type. Perf tools that can decode PERF_BR_EXTEND_ABI will then parse branch_entry.new_type as well. branch_entry.new_type is a 4-bit field which can hold up to 16 branch types. The first three branch types hold various generic fault types, followed by five architecture-specific branch types, which can be overridden by the platform for specific use cases. These architecture-specific branch types get overridden on the arm64 platform for the BRBE implementation. New generic branch types - PERF_BR_NEW_FAULT_ALGN - PERF_BR_NEW_FAULT_DATA - PERF_BR_NEW_FAULT_INST New arch specific branch types - PERF_BR_NEW_ARCH_1 - PERF_BR_NEW_ARCH_2 - PERF_BR_NEW_ARCH_3 - PERF_BR_NEW_ARCH_4 - PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-3-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index a79cc0eb4de7..fed60e6b10e5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -255,6 +255,7 @@ enum { PERF_BR_IRQ = 12, /* irq */ PERF_BR_SERROR = 13, /* system error */ PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ PERF_BR_MAX, }; @@ -269,6 +270,18 @@ enum { PERF_BR_SPEC_MAX, }; +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1388,7 +1401,8 @@ struct perf_branch_entry { cycles:16, /* cycle count to last branch */ type:4, /* branch type */ spec:2, /* branch speculation info */ - reserved:38; + new_type:4, /* additional branch type */ + reserved:34; }; union perf_sample_weight { -- cgit v1.2.3 From 5402d25aa5710d240040f73fb13d7d5c303ef071 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:17 +0530 Subject: perf: Capture branch privilege information Platforms like arm64 can capture privilege level information for all branch records. Hence this adds a new element in struct branch_entry to record the privilege level, which can be requested through a new event.attr.branch_sample_type flag, PERF_SAMPLE_BRANCH_PRIV_SAVE. This flag lets the user choose whether privilege information is captured.
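As a rough illustration of how a consumer combines these new fields (a hypothetical user-space sketch, not part of the patches; it assumes a uapi perf_event.h recent enough to carry the spec, new_type and priv bitfields and the enums added by this series):

#include <stdio.h>
#include <linux/perf_event.h>

/* Name a sampled branch, falling back to new_type whenever type
 * says the classification lives in the extended space. */
static const char *branch_type_name(const struct perf_branch_entry *br)
{
	if (br->type != PERF_BR_EXTEND_ABI) {
		switch (br->type) {
		case PERF_BR_SERROR:	return "serror";
		case PERF_BR_NO_TX:	return "no_tx";
		default:		return "other";
		}
	}

	switch (br->new_type) {
	case PERF_BR_NEW_FAULT_ALGN:	return "fault_align";
	case PERF_BR_NEW_FAULT_DATA:	return "fault_data";
	case PERF_BR_NEW_FAULT_INST:	return "fault_inst";
	default:			return "arch_specific";	/* PERF_BR_NEW_ARCH_* */
	}
}

static void print_branch(const struct perf_branch_entry *br)
{
	/* priv is only meaningful when PERF_SAMPLE_BRANCH_PRIV_SAVE was requested */
	printf("%llx -> %llx %s priv=%u\n",
	       (unsigned long long)br->from, (unsigned long long)br->to,
	       branch_type_name(br), (unsigned int)br->priv);
}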
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-4-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fed60e6b10e5..1a258d45a3fa 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -233,6 +235,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -282,6 +286,13 @@ enum { PERF_BR_NEW_MAX, }; +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1402,7 +1413,8 @@ struct perf_branch_entry { type:4, /* branch type */ spec:2, /* branch speculation info */ new_type:4, /* additional branch type */ - reserved:34; + priv:3, /* privilege level */ + reserved:31; }; union perf_sample_weight { -- cgit v1.2.3 From f4054e522531038354bea5c924f286fdd8ae77b5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:18 +0530 Subject: perf: Add PERF_BR_NEW_ARCH_[N] map for BRBE on arm64 platform BRBE captured branch types will overflow perf_branch_entry.type and generic branch types in perf_branch_entry.new_type. So override each available arch specific branch type in the following manner to comprehensively process all reported branch types in BRBE. PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-5-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1a258d45a3fa..dca16582885f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -293,6 +293,12 @@ enum { PERF_BR_PRIV_HV = 3, }; +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ -- cgit v1.2.3 From abc340b38ba25cd6c7aa2c0bd9150d30738c82d0 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 26 Aug 2022 14:46:59 +0300 Subject: xfrm: interface: support collect metadata mode This commit adds support for 'collect_md' mode on xfrm interfaces. Each net can have one collect_md device, created by providing the IFLA_XFRM_COLLECT_METADATA flag at creation. 
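As a usage sketch of that creation step (a hypothetical helper, not part of the patch; it assumes libmnl, though any rtnetlink wrapper would do):

#include <time.h>
#include <sys/socket.h>
#include <libmnl/libmnl.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>

/* Create the (single, per-netns) metadata-mode xfrm device over rtnetlink. */
static int create_md_xfrmi(struct mnl_socket *nl, const char *name)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nlattr *linkinfo, *data;
	struct ifinfomsg *ifm;

	nlh->nlmsg_type = RTM_NEWLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	nlh->nlmsg_seq = time(NULL);
	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;

	mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
	linkinfo = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
	mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "xfrm");
	data = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
	/* NLA_FLAG attribute: presence alone selects collect_md mode */
	mnl_attr_put(nlh, IFLA_XFRM_COLLECT_METADATA, 0, NULL);
	mnl_attr_nest_end(nlh, data);
	mnl_attr_nest_end(nlh, linkinfo);

	return mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0 ? -1 : 0;
}

The sketch deliberately sends neither IFLA_XFRM_IF_ID nor IFLA_XFRM_LINK: as the patch below enforces, both must be zero for a collect_md device.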
This device cannot be altered and has no if_id or link device attributes. On transmit to this device, the if_id is fetched from the attached dst metadata on the skb. If exists, the link property is also fetched from the metadata. The dst metadata type used is METADATA_XFRM which holds these properties. On the receive side, xfrmi_rcv_cb() populates a dst metadata for each packet received and attaches it to the skb. The if_id used in this case is fetched from the xfrm state, and the link is fetched from the incoming device. This information can later be used by upper layers such as tc, ebpf, and ip rules. Because the skb is scrubed in xfrmi_rcv_cb(), the attachment of the dst metadata is postponed until after scrubing. Similarly, xfrm_input() is adapted to avoid dropping metadata dsts by only dropping 'valid' (skb_valid_dst(skb) == true) dsts. Policy matching on packets arriving from collect_md xfrmi devices is done by using the xfrm state existing in the skb's sec_path. The xfrm_if_cb.decode_cb() interface implemented by xfrmi_decode_session() is changed to keep the details of the if_id extraction tucked away in xfrm_interface.c. Reviewed-by: Nicolas Dichtel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 11 +++- include/uapi/linux/if_link.h | 1 + net/xfrm/xfrm_input.c | 7 ++- net/xfrm/xfrm_interface.c | 121 ++++++++++++++++++++++++++++++++++++------- net/xfrm/xfrm_policy.c | 10 ++-- 5 files changed, 121 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6e8fa98f786f..28b988577ed2 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -312,9 +312,15 @@ struct km_event { struct net *net; }; +struct xfrm_if_decode_session_result { + struct net *net; + u32 if_id; +}; + struct xfrm_if_cb { - struct xfrm_if *(*decode_session)(struct sk_buff *skb, - unsigned short family); + bool (*decode_session)(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); @@ -985,6 +991,7 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ u32 if_id; /* interface identifyer */ + bool collect_md; }; struct xfrm_if { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e36d9d2c65a7..d96f13a42589 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -694,6 +694,7 @@ enum { IFLA_XFRM_UNSPEC, IFLA_XFRM_LINK, IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, __IFLA_XFRM_MAX }; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 144238a50f3d..25e822fb5771 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "xfrm_inout.h" @@ -720,7 +721,8 @@ resume: sp = skb_sec_path(skb); if (sp) sp->olen = 0; - skb_dst_drop(skb); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; } else { @@ -738,7 +740,8 @@ resume: sp = skb_sec_path(skb); if (sp) sp->olen = 0; - skb_dst_drop(skb); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 5113fa0fbcee..e9a355047468 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 
@@ static const struct net_device_ops xfrmi_netdev_ops; struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; + struct xfrm_if __rcu *collect_md_xfrmi; }; #define for_each_xfrmi_rcu(start, xi) \ @@ -77,17 +79,23 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) return xi; } + xi = rcu_dereference(xfrmn->collect_md_xfrmi); + if (xi && (xi->dev->flags & IFF_UP)) + return xi; + return NULL; } -static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, - unsigned short family) +static bool xfrmi_decode_session(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res) { struct net_device *dev; + struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) - return NULL; + return false; switch (family) { case AF_INET6: @@ -107,11 +115,18 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, } if (!dev || !(dev->flags & IFF_UP)) - return NULL; + return false; if (dev->netdev_ops != &xfrmi_netdev_ops) - return NULL; + return false; - return netdev_priv(dev); + xi = netdev_priv(dev); + res->net = xi->net; + + if (xi->p.collect_md) + res->if_id = xfrm_input_state(skb)->if_id; + else + res->if_id = xi->p.if_id; + return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) @@ -157,7 +172,10 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - xfrmi_link(xfrmn, xi); + if (xi->p.collect_md) + rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); + else + xfrmi_link(xfrmn, xi); return 0; @@ -185,7 +203,10 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); - xfrmi_unlink(xfrmn, xi); + if (xi->p.collect_md) + RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); + else + xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) @@ -214,6 +235,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) struct xfrm_state *x; struct xfrm_if *xi; bool xnet; + int link; if (err && !secpath_exists(skb)) return 0; @@ -224,6 +246,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) if (!xi) return 1; + link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; @@ -254,6 +277,17 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) } xfrmi_scrub_packet(skb, xnet); + if (xi->p.collect_md) { + struct metadata_dst *md_dst; + + md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); + if (!md_dst) + return -ENOMEM; + + md_dst->u.xfrm_info.if_id = x->if_id; + md_dst->u.xfrm_info.link = link; + skb_dst_set(skb, (struct dst_entry *)md_dst); + } dev_sw_netstats_rx_add(dev, skb->len); return 0; @@ -269,10 +303,23 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct net_device *tdev; struct xfrm_state *x; int err = -1; + u32 if_id; int mtu; + if (xi->p.collect_md) { + struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); + + if (unlikely(!md_info)) + return -EINVAL; + + if_id = md_info->if_id; + fl->flowi_oif = md_info->link; + } else { + if_id = xi->p.if_id; + } + dst_hold(dst); - dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -283,7 +330,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (!x) goto tx_err_link_failure; - if (x->if_id != xi->p.if_id) + if (x->if_id != if_id) goto 
tx_err_link_failure; tdev = dst->dev; @@ -633,6 +680,9 @@ static void xfrmi_netlink_parms(struct nlattr *data[], if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); + + if (data[IFLA_XFRM_COLLECT_METADATA]) + parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, @@ -645,14 +695,27 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, int err; xfrmi_netlink_parms(data, &p); - if (!p.if_id) { - NL_SET_ERR_MSG(extack, "if_id must be non zero"); - return -EINVAL; - } + if (p.collect_md) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - xi = xfrmi_locate(net, &p); - if (xi) - return -EEXIST; + if (p.link || p.if_id) { + NL_SET_ERR_MSG(extack, "link and if_id must be zero"); + return -EINVAL; + } + + if (rtnl_dereference(xfrmn->collect_md_xfrmi)) + return -EEXIST; + + } else { + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + + xi = xfrmi_locate(net, &p); + if (xi) + return -EEXIST; + } xi = netdev_priv(dev); xi->p = p; @@ -682,12 +745,22 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } + if (p.collect_md) { + NL_SET_ERR_MSG(extack, "collect_md can't be changed"); + return -EINVAL; + } + xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; + if (xi->p.collect_md) { + NL_SET_ERR_MSG(extack, + "device can't be changed to collect_md"); + return -EINVAL; + } } return xfrmi_update(xi, &p); @@ -700,6 +773,8 @@ static size_t xfrmi_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + + /* IFLA_XFRM_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -709,7 +784,8 @@ static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || - nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id)) + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || + (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; @@ -725,8 +801,10 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev) } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { - [IFLA_XFRM_LINK] = { .type = NLA_U32 }, - [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, + [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, + [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { @@ -762,6 +840,9 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) xip = &xi->next) unregister_netdevice_queue(xi->dev, &list); } + xi = rtnl_dereference(xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, &list); } unregister_netdevice_many(&list); rtnl_unlock(); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6264680b1f08..3c65059a508a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3515,17 +3515,17 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; - struct xfrm_if *xi; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { - xi = ifcb->decode_session(skb, family); - if (xi) { - if_id = xi->p.if_id; - net = xi->net; + struct xfrm_if_decode_session_result r; + + if 
(ifcb->decode_session(skb, family, &r)) { + if_id = r.if_id; + net = r.net; } } rcu_read_unlock(); -- cgit v1.2.3 From 2c2493b9da9166478fe072e3054f8a5741dadb02 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 26 Aug 2022 14:47:00 +0300 Subject: xfrm: lwtunnel: add lwtunnel support for xfrm interfaces in collect_md mode Allow specifying the xfrm interface if_id and link as part of a route metadata using the lwtunnel infrastructure. This allows for example using a single xfrm interface in collect_md mode as the target of multiple routes each specifying a different if_id. With the appropriate changes to iproute2, considering an xfrm device ipsec1 in collect_md mode one can for example add a route specifying an if_id like so: ip route add dev ipsec1 encap xfrm if_id 1 In which case traffic routed to the device via this route would use if_id in the xfrm interface policy lookup. Or in the context of vrf, one can also specify the "link" property: ip route add dev ipsec1 encap xfrm if_id 1 link_dev eth15 Note: LWT_XFRM_LINK uses NLA_U32 similar to IFLA_XFRM_LINK even though internally "link" is signed. This is consistent with other _LINK attributes in other devices as well as in bpf and should not have an effect as device indexes can't be negative. Reviewed-by: Nicolas Dichtel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/dst_metadata.h | 11 ++++++ include/uapi/linux/lwtunnel.h | 10 +++++ net/core/lwtunnel.c | 1 + net/xfrm/xfrm_interface.c | 85 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index e4b059908cc7..57f75960fa28 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -60,13 +60,24 @@ skb_tunnel_info(const struct sk_buff *skb) return NULL; } +static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) +{ + return (struct xfrm_md_info *)lwt->data; +} + static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; + dst = skb_dst(skb); + if (dst && dst->lwtstate && + dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) + return lwt_xfrm_info(dst->lwtstate); + return NULL; } diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 2e206919125c..229655ef792f 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -15,6 +15,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_SEG6_LOCAL, LWTUNNEL_ENCAP_RPL, LWTUNNEL_ENCAP_IOAM6, + LWTUNNEL_ENCAP_XFRM, __LWTUNNEL_ENCAP_MAX, }; @@ -111,4 +112,13 @@ enum { #define LWT_BPF_MAX_HEADROOM 256 +enum { + LWT_XFRM_UNSPEC, + LWT_XFRM_IF_ID, + LWT_XFRM_LINK, + __LWT_XFRM_MAX, +}; + +#define LWT_XFRM_MAX (__LWT_XFRM_MAX - 1) + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 9ccd64e8a666..6fac2f0ef074 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -50,6 +50,7 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_XFRM: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index e9a355047468..5a67b120c4db 100644 --- a/net/xfrm/xfrm_interface.c +++ 
b/net/xfrm/xfrm_interface.c @@ -60,6 +60,88 @@ struct xfrmi_net { struct xfrm_if __rcu *collect_md_xfrmi; }; +static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { + [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +static void xfrmi_destroy_state(struct lwtunnel_state *lwt) +{ +} + +static int xfrmi_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWT_XFRM_MAX + 1]; + struct lwtunnel_state *new_state; + struct xfrm_md_info *info; + int ret; + + ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); + if (ret < 0) + return ret; + + if (!tb[LWT_XFRM_IF_ID]) { + NL_SET_ERR_MSG(extack, "if_id must be set"); + return -EINVAL; + } + + new_state = lwtunnel_state_alloc(sizeof(*info)); + if (!new_state) { + NL_SET_ERR_MSG(extack, "failed to create encap info"); + return -ENOMEM; + } + + new_state->type = LWTUNNEL_ENCAP_XFRM; + + info = lwt_xfrm_info(new_state); + + info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); + + if (tb[LWT_XFRM_LINK]) + info->link = nla_get_u32(tb[LWT_XFRM_LINK]); + + *ts = new_state; + return 0; +} + +static int xfrmi_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct xfrm_md_info *info = lwt_xfrm_info(lwt); + + if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || + (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) + return -EMSGSIZE; + + return 0; +} + +static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ + nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ +} + +static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct xfrm_md_info *a_info = lwt_xfrm_info(a); + struct xfrm_md_info *b_info = lwt_xfrm_info(b); + + return memcmp(a_info, b_info, sizeof(*a_info)); +} + +static const struct lwtunnel_encap_ops xfrmi_encap_ops = { + .build_state = xfrmi_build_state, + .destroy_state = xfrmi_destroy_state, + .fill_encap = xfrmi_fill_encap_info, + .get_encap_size = xfrmi_encap_nlsize, + .cmp_encap = xfrmi_encap_cmp, + .owner = THIS_MODULE, +}; + #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) @@ -1080,6 +1162,8 @@ static int __init xfrmi_init(void) if (err < 0) goto rtnl_link_failed; + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); + xfrm_if_register_cb(&xfrm_if_cb); return err; @@ -1098,6 +1182,7 @@ pernet_dev_failed: static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); + lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); -- cgit v1.2.3 From a41c4088cf431a9f2458f4e54fbbf3113a1bd063 Mon Sep 17 00:00:00 2001 From: Xavier Roumegue Date: Sat, 30 Jul 2022 17:48:39 +0200 Subject: media: v4l: uapi: Add user control base for DW100 controls Add a control base for DW100 driver controls, and reserve 16 controls. 
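To make the reservation concrete, here is a small hypothetical sketch (not part of the patch) of how user space could probe the 16-ID window reserved here; the one control that actually lands in it is defined in the next patch:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Walk the control IDs reserved for the DW100 driver and report
 * which ones the device behind fd actually implements. */
static void probe_dw100_controls(int fd)
{
	unsigned int i;

	for (i = 0; i < 16; i++) {
		struct v4l2_query_ext_ctrl qc;

		memset(&qc, 0, sizeof(qc));
		qc.id = V4L2_CID_USER_DW100_BASE + i;
		if (ioctl(fd, VIDIOC_QUERY_EXT_CTRL, &qc) == 0)
			printf("control 0x%08x: %s\n", qc.id, qc.name);
	}
}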
Signed-off-by: Xavier Roumegue Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-controls.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 5f46bf4a570c..87fa476428ee 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -225,6 +225,12 @@ enum v4l2_colorfx { */ #define V4L2_CID_USER_ISL7998X_BASE (V4L2_CID_USER_BASE + 0x1180) +/* + * The base for DW100 driver controls. + * We reserve 16 controls for this driver. + */ +#define V4L2_CID_USER_DW100_BASE (V4L2_CID_USER_BASE + 0x1190) + /* MPEG-class control IDs */ /* The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical */ -- cgit v1.2.3 From 9d5c3c06980510b27e8f7ff033a21120e42c9715 Mon Sep 17 00:00:00 2001 From: Xavier Roumegue Date: Sat, 30 Jul 2022 17:48:40 +0200 Subject: media: uapi: Add a control for DW100 driver The DW100 driver gets the dewarping mapping as a binary blob from the userspace application through a custom control. The blob format is hardware specific so create a dedicated control for this purpose. Signed-off-by: Xavier Roumegue Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/drivers/dw100.rst | 15 +++++++++++++++ include/uapi/linux/dw100.h | 14 ++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 include/uapi/linux/dw100.h (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/drivers/dw100.rst b/Documentation/userspace-api/media/drivers/dw100.rst index 1ca6fa55f539..fceea6ece622 100644 --- a/Documentation/userspace-api/media/drivers/dw100.rst +++ b/Documentation/userspace-api/media/drivers/dw100.rst @@ -66,4 +66,19 @@ map. More details on the DW100 hardware operations can be found in *chapter 13.15 DeWarp* of IMX8MP_ reference manual. +The Vivante DW100 m2m driver implements the following driver-specific control: + +``V4L2_CID_DW100_DEWARPING_16x16_VERTEX_MAP (__u32 array)`` + Specifies to DW100 driver its dewarping map (aka LUT) blob as described in + *chapter 13.15.2.3 Dewarping Remap* of IMX8MP_ reference manual as an U32 + dynamic array. The image is divided into many small 16x16 blocks. If the + width/height of the image is not divisible by 16, the size of the + rightmost/bottommost block is the remainder. The dewarping map only saves + the vertex coordinates of the block. The dewarping grid map is comprised of + vertex coordinates for x and y. Each x, y coordinate register uses 16 bits + (UQ12.4) to record the coordinate address, with the Y coordinate in the + upper bits and X in the lower bits. The driver modifies the dimensions of + this control when the sink format is changed, to reflect the new input + resolution. + .. _IMX8MP: https://www.nxp.com/webapp/Download?colCode=IMX8MPRM diff --git a/include/uapi/linux/dw100.h b/include/uapi/linux/dw100.h new file mode 100644 index 000000000000..3356496edd6b --- /dev/null +++ b/include/uapi/linux/dw100.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* Copyright 2022 NXP */ + +#ifndef __UAPI_DW100_H__ +#define __UAPI_DW100_H__ + +#include + +/* + * Check Documentation/userspace-api/media/drivers/dw100.rst for control details. 
+ */ +#define V4L2_CID_DW100_DEWARPING_16x16_VERTEX_MAP (V4L2_CID_USER_DW100_BASE + 1) + +#endif -- cgit v1.2.3 From 1c24bb3f8bec5805df4763a4327ce616fadfed65 Mon Sep 17 00:00:00 2001 From: Deborah Brouwer Date: Thu, 11 Aug 2022 22:37:56 +0200 Subject: media: v4l2-ctrls: Fix typo in VP8 comment The comment for the VP8 loop filter flags uses the partially wrong name for the flags. Unlike the other VP8 flag names, the loop filter flag names don't have "_FLAG" in them. Change the comment so that it matches the actual flag definitions in the header. Signed-off-by: Deborah Brouwer Reviewed-by: Nicolas Dufresne Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-controls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 87fa476428ee..b5e7d082b8ad 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1736,7 +1736,7 @@ struct v4l2_vp8_segment { * @sharpness_level: matches sharpness_level syntax element. * @level: matches loop_filter_level syntax element. * @padding: padding field. Should be zeroed by applications. - * @flags: see V4L2_VP8_LF_FLAG_{}. + * @flags: see V4L2_VP8_LF_{}. * * This structure contains loop filter related parameters. * See the 'mb_lf_adjustments()' part of the frame header syntax, -- cgit v1.2.3 From 17611d3fb4a11ec500c49cb952faf09e114a5a10 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 18 Aug 2022 09:51:40 +0200 Subject: media: videodev2.h: drop V4L2_CAP_ASYNCIO The V4L2_CAP_ASYNCIO capability was never implemented (and in fact it isn't clear what it was supposed to do in the first place). Drop it from the capabilities list. Keep it in videodev2.h with the other defines under ifndef __KERNEL__ for backwards compatibility. This will free up a capability bit for other future uses. And having an unused and undefined I/O method is just plain confusing. Signed-off-by: Hans Verkuil Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/async.rst | 9 --------- Documentation/userspace-api/media/v4l/dev-raw-vbi.rst | 2 +- Documentation/userspace-api/media/v4l/dev-sdr.rst | 2 +- Documentation/userspace-api/media/v4l/dev-sliced-vbi.rst | 2 +- Documentation/userspace-api/media/v4l/hist-v4l2.rst | 2 +- Documentation/userspace-api/media/v4l/io.rst | 4 +--- Documentation/userspace-api/media/v4l/vidioc-querycap.rst | 3 --- include/uapi/linux/videodev2.h | 6 +++++- 8 files changed, 10 insertions(+), 20 deletions(-) delete mode 100644 Documentation/userspace-api/media/v4l/async.rst (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/async.rst b/Documentation/userspace-api/media/v4l/async.rst deleted file mode 100644 index d6960ff5c382..000000000000 --- a/Documentation/userspace-api/media/v4l/async.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later - -.. _async: - -**************** -Asynchronous I/O -**************** - -This method is not defined yet. 
diff --git a/Documentation/userspace-api/media/v4l/dev-raw-vbi.rst b/Documentation/userspace-api/media/v4l/dev-raw-vbi.rst index 58f97c3a7792..2bec20d87928 100644 --- a/Documentation/userspace-api/media/v4l/dev-raw-vbi.rst +++ b/Documentation/userspace-api/media/v4l/dev-raw-vbi.rst @@ -41,7 +41,7 @@ Devices supporting the raw VBI capturing or output API set the in the ``capabilities`` field of struct :c:type:`v4l2_capability` returned by the :ref:`VIDIOC_QUERYCAP` ioctl. At least one of the -read/write, streaming or asynchronous I/O methods must be supported. VBI +read/write or streaming I/O methods must be supported. VBI devices may or may not have a tuner or modulator. Supplemental Functions diff --git a/Documentation/userspace-api/media/v4l/dev-sdr.rst b/Documentation/userspace-api/media/v4l/dev-sdr.rst index 928884dfe09d..dfdeddbca41f 100644 --- a/Documentation/userspace-api/media/v4l/dev-sdr.rst +++ b/Documentation/userspace-api/media/v4l/dev-sdr.rst @@ -34,7 +34,7 @@ Devices supporting the SDR transmitter interface set the device has an Digital to Analog Converter (DAC), which is a mandatory element for the SDR transmitter. -At least one of the read/write, streaming or asynchronous I/O methods +At least one of the read/write or streaming I/O methods must be supported. diff --git a/Documentation/userspace-api/media/v4l/dev-sliced-vbi.rst b/Documentation/userspace-api/media/v4l/dev-sliced-vbi.rst index 97ec2b115c71..44415822c7c5 100644 --- a/Documentation/userspace-api/media/v4l/dev-sliced-vbi.rst +++ b/Documentation/userspace-api/media/v4l/dev-sliced-vbi.rst @@ -36,7 +36,7 @@ Devices supporting the sliced VBI capturing or output API set the respectively, in the ``capabilities`` field of struct :c:type:`v4l2_capability` returned by the :ref:`VIDIOC_QUERYCAP` ioctl. At least one of the -read/write, streaming or asynchronous :ref:`I/O methods ` must be +read/write or streaming :ref:`I/O methods ` must be supported. Sliced VBI devices may have a tuner or modulator. Supplemental Functions diff --git a/Documentation/userspace-api/media/v4l/hist-v4l2.rst b/Documentation/userspace-api/media/v4l/hist-v4l2.rst index 28a2750d5c8c..dbc04374dc22 100644 --- a/Documentation/userspace-api/media/v4l/hist-v4l2.rst +++ b/Documentation/userspace-api/media/v4l/hist-v4l2.rst @@ -316,7 +316,7 @@ This unnamed version was finally merged into Linux 2.5.46. There are new fields to identify the driver, a new RDS device function ``V4L2_CAP_RDS_CAPTURE``, the ``V4L2_CAP_AUDIO`` flag indicates if the device has any audio connectors, another I/O - capability ``V4L2_CAP_ASYNCIO`` can be flagged. In response to these + capability V4L2_CAP_ASYNCIO can be flagged. In response to these changes the ``type`` field became a bit set and was merged into the ``flags`` field. ``V4L2_FLAG_TUNER`` was renamed to ``V4L2_CAP_TUNER``, ``V4L2_CAP_VIDEO_OVERLAY`` replaced diff --git a/Documentation/userspace-api/media/v4l/io.rst b/Documentation/userspace-api/media/v4l/io.rst index ce0cece6f35f..4b1964df9d73 100644 --- a/Documentation/userspace-api/media/v4l/io.rst +++ b/Documentation/userspace-api/media/v4l/io.rst @@ -17,8 +17,7 @@ read or write will fail at any time. Other methods must be negotiated. To select the streaming I/O method with memory mapped or user buffers applications call the -:ref:`VIDIOC_REQBUFS` ioctl. The asynchronous I/O -method is not defined yet. +:ref:`VIDIOC_REQBUFS` ioctl. Video overlay can be considered another I/O method, although the application does not directly receive the image data. 
It is selected by @@ -46,6 +45,5 @@ The following sections describe the various I/O methods in more detail. mmap userp dmabuf - async buffer field-order diff --git a/Documentation/userspace-api/media/v4l/vidioc-querycap.rst b/Documentation/userspace-api/media/v4l/vidioc-querycap.rst index 63e23f6f95ee..6c57b8428356 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-querycap.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-querycap.rst @@ -244,9 +244,6 @@ specification the ioctl returns an ``EINVAL`` error code. - 0x01000000 - The device supports the :c:func:`read()` and/or :c:func:`write()` I/O methods. - * - ``V4L2_CAP_ASYNCIO`` - - 0x02000000 - - The device supports the :ref:`asynchronous ` I/O methods. * - ``V4L2_CAP_STREAMING`` - 0x04000000 - The device supports the :ref:`streaming ` I/O method. diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c415ce5b6829..86cae23cc446 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -502,7 +502,6 @@ struct v4l2_capability { #define V4L2_CAP_META_CAPTURE 0x00800000 /* Is a metadata capture device */ #define V4L2_CAP_READWRITE 0x01000000 /* read/write systemcalls */ -#define V4L2_CAP_ASYNCIO 0x02000000 /* async I/O */ #define V4L2_CAP_STREAMING 0x04000000 /* streaming I/O ioctls */ #define V4L2_CAP_META_OUTPUT 0x08000000 /* Is a metadata output device */ @@ -2683,6 +2682,11 @@ struct v4l2_create_buffers { #ifndef __KERNEL__ #define V4L2_PIX_FMT_HM12 V4L2_PIX_FMT_NV12_16L16 #define V4L2_PIX_FMT_SUNXI_TILED_NV12 V4L2_PIX_FMT_NV12_32L32 +/* + * This capability was never implemented, anyone using this cap should drop it + * from their code. + */ +#define V4L2_CAP_ASYNCIO 0x02000000 #endif #endif /* _UAPI__LINUX_VIDEODEV2_H */ -- cgit v1.2.3 From 690252f19f0e486abb8590b3a7a03d4e065d93d4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:31 -0700 Subject: netlink: add support for ext_ack missing attributes There is currently no way to report via extack in a structured way that an attribute is missing. This leads to families resorting to string messages. Add a pair of attributes - @offset and @type for machine-readable way of reporting missing attributes. The @offset points to the nest which should have contained the attribute, @type is the expected nla_type. The offset will be skipped if the attribute is missing at the message level rather than inside a nest. User space should be able to figure out which attribute enum (AKA attribute space AKA attribute set) the nest pointed to by @offset is using. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- Documentation/userspace-api/netlink/intro.rst | 7 +++++-- include/linux/netlink.h | 13 +++++++++++++ include/uapi/linux/netlink.h | 6 ++++++ net/netlink/af_netlink.c | 12 ++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/netlink/intro.rst b/Documentation/userspace-api/netlink/intro.rst index 94337f79e077..8f1220756412 100644 --- a/Documentation/userspace-api/netlink/intro.rst +++ b/Documentation/userspace-api/netlink/intro.rst @@ -359,8 +359,8 @@ compatibility this feature has to be explicitly enabled by setting the ``NETLINK_EXT_ACK`` setsockopt() to ``1``. Types of extended ack attributes are defined in enum nlmsgerr_attrs. -The two most commonly used attributes are ``NLMSGERR_ATTR_MSG`` -and ``NLMSGERR_ATTR_OFFS``. 
+The most commonly used attributes are ``NLMSGERR_ATTR_MSG``, +``NLMSGERR_ATTR_OFFS`` and ``NLMSGERR_ATTR_MISS_*``. ``NLMSGERR_ATTR_MSG`` carries a message in English describing the encountered problem. These messages are far more detailed @@ -368,6 +368,9 @@ than what can be expressed thru standard UNIX error codes. ``NLMSGERR_ATTR_OFFS`` points to the attribute which caused the problem. +``NLMSGERR_ATTR_MISS_TYPE`` and ``NLMSGERR_ATTR_MISS_NEST`` +inform about a missing attribute. + Extended ACKs can be reported on errors as well as in case of success. The latter should be treated as a warning. diff --git a/include/linux/netlink.h b/include/linux/netlink.h index bda1c385cffb..1619221c415c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -71,6 +71,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute + * @miss_type: attribute type which was missing + * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ @@ -78,6 +80,8 @@ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; + const struct nlattr *miss_nest; + u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -126,6 +130,15 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) +#define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->miss_nest = (nest); \ + __extack->miss_type = (type); \ + } \ +} while (0) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e0ab261ceca2..e0689dbd2cde 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -140,6 +140,10 @@ struct nlmsgerr { * be used - in the success case - to identify a created * object or operation or similar (binary) * @NLMSGERR_ATTR_POLICY: policy for a rejected attribute + * @NLMSGERR_ATTR_MISS_TYPE: type of a missing required attribute, + * %NLMSGERR_ATTR_MISS_NEST will not be present if the attribute was + * missing at the message level + * @NLMSGERR_ATTR_MISS_NEST: offset of the nest where attribute was missing * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -149,6 +153,8 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_OFFS, NLMSGERR_ATTR_COOKIE, NLMSGERR_ATTR_POLICY, + NLMSGERR_ATTR_MISS_TYPE, + NLMSGERR_ATTR_MISS_NEST, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7eae157c1625..f89ba302ac6e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2423,6 +2423,10 @@ netlink_ack_tlv_len(struct netlink_sock *nlk, int err, tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); + if (extack->miss_type) + tlvlen += nla_total_size(sizeof(u32)); + if (extack->miss_nest) + tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } @@ -2449,6 +2453,14 @@ netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); + if (extack->miss_type) + 
WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, + extack->miss_type)); + if (extack->miss_nest && + !WARN_ON((u8 *)extack->miss_nest < in_skb->data || + (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, + (u8 *)extack->miss_nest - (u8 *)nlh)); } void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, -- cgit v1.2.3 From fce1c23f629173e0db78b79a74f2052044a00e65 Mon Sep 17 00:00:00 2001 From: Alvaro Karsz Date: Tue, 23 Aug 2022 10:39:47 +0300 Subject: net: virtio_net: fix notification coalescing comments Fix wording in comments for the notification coalescing feature. Signed-off-by: Alvaro Karsz Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20220823073947.14774-1-alvaro.karsz@solid-run.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/virtio_net.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 29ced55514d4..6cb842ea8979 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -56,7 +56,7 @@ #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ -#define VIRTIO_NET_F_NOTF_COAL 53 /* Guest can handle notifications coalescing */ +#define VIRTIO_NET_F_NOTF_COAL 53 /* Device supports notifications coalescing */ #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ @@ -364,24 +364,24 @@ struct virtio_net_hash_config { */ #define VIRTIO_NET_CTRL_NOTF_COAL 6 /* - * Set the tx-usecs/tx-max-packets patameters. - * tx-usecs - Maximum number of usecs to delay a TX notification. - * tx-max-packets - Maximum number of packets to send before a TX notification. + * Set the tx-usecs/tx-max-packets parameters. */ struct virtio_net_ctrl_coal_tx { + /* Maximum number of packets to send before a TX notification */ __le32 tx_max_packets; + /* Maximum number of usecs to delay a TX notification */ __le32 tx_usecs; }; #define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET 0 /* - * Set the rx-usecs/rx-max-packets patameters. - * rx-usecs - Maximum number of usecs to delay a RX notification. - * rx-max-frames - Maximum number of packets to receive before a RX notification. + * Set the rx-usecs/rx-max-packets parameters. */ struct virtio_net_ctrl_coal_rx { + /* Maximum number of packets to receive before a RX notification */ __le32 rx_max_packets; + /* Maximum number of usecs to delay a RX notification */ __le32 rx_usecs; }; -- cgit v1.2.3 From 8f36b3b4e1b58dca7d05e1579019230437e55d43 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 23 Aug 2022 18:24:56 -0600 Subject: usbip: add USBIP_URB_* URB transfer flags The USBIP driver packs URB transfer flags in network packets that are exchanged between the Server (usbip_host) and the Client (vhci_hcd). URB_* flags are internal to the kernel and could change, whereas the USBIP URB flags exchanged in network packets are part of the USBIP user API and must not change. Add USBIP_URB_* flags to make this an explicit API, and change the client and server to map them. Details as follows: Client tx path (USBIP_CMD_SUBMIT): - Maps URB_* to USBIP_URB_* when it sends a USBIP_CMD_SUBMIT packet. Server rx path (USBIP_CMD_SUBMIT): - Maps USBIP_URB_* to URB_* when it receives a USBIP_CMD_SUBMIT packet.
Flags aren't included in USBIP_CMD_UNLINK and USBIP_RET_SUBMIT packets and no special handling is needed for them in the following cases: - Server rx path (USBIP_CMD_UNLINK) - Client rx path & Server tx path (USBIP_RET_SUBMIT) Update protocol documentation to reflect the change. Suggested-by: Hongren Zenithal Zheng Suggested-by: Alan Stern Signed-off-by: Shuah Khan Link: https://lore.kernel.org/r/20220824002456.94605-1-skhan@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Documentation/usb/usbip_protocol.rst | 13 +++--- drivers/usb/usbip/stub_rx.c | 4 +- drivers/usb/usbip/usbip_common.c | 91 ++++++++++++++++++++++++++++++++++-- include/uapi/linux/usbip.h | 26 +++++++++++ 4 files changed, 122 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/usb/usbip_protocol.rst b/Documentation/usb/usbip_protocol.rst index 0b8541fda4d8..adc158967cc6 100644 --- a/Documentation/usb/usbip_protocol.rst +++ b/Documentation/usb/usbip_protocol.rst @@ -340,13 +340,12 @@ USBIP_CMD_SUBMIT: | 0 | 20 | usbip_header_basic, 'command' shall be 0x00000001 | +-----------+--------+---------------------------------------------------+ | 0x14 | 4 | transfer_flags: possible values depend on the | -| | | URB transfer_flags (refer to URB doc in | -| | | Documentation/driver-api/usb/URB.rst) | -| | | but with URB_NO_TRANSFER_DMA_MAP masked. Refer to | -| | | function usbip_pack_cmd_submit and function | -| | | tweak_transfer_flags in drivers/usb/usbip/ | -| | | usbip_common.c. The following fields may also ref | -| | | to function usbip_pack_cmd_submit and URB doc | +| | | USBIP_URB transfer_flags. | +| | | Refer to include/uapi/linux/usbip.h and | +| | | Documentation/driver-api/usb/URB.rst. | +| | | Refer to usbip_pack_cmd_submit() and | +| | | tweak_transfer_flags() in drivers/usb/usbip/ | +| | | usbip_common.c. | +-----------+--------+---------------------------------------------------+ | 0x18 | 4 | transfer_buffer_length: | | | | use URB transfer_buffer_length | diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 5dd41e8215e0..fc01b31bbb87 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -464,7 +464,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, int nents; int num_urbs = 1; int pipe = get_pipe(sdev, pdu); - int use_sg = pdu->u.cmd_submit.transfer_flags & URB_DMA_MAP_SG; + int use_sg = pdu->u.cmd_submit.transfer_flags & USBIP_URB_DMA_MAP_SG; int support_sg = 1; int np = 0; int ret, i; @@ -514,7 +514,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, num_urbs = nents; priv->completed_urbs = 0; pdu->u.cmd_submit.transfer_flags &= - ~URB_DMA_MAP_SG; + ~USBIP_URB_DMA_MAP_SG; } } else { buffer = kzalloc(buf_len, GFP_KERNEL); diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 2ab99244bc31..053a2bca4c47 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -344,6 +344,91 @@ static unsigned int tweak_transfer_flags(unsigned int flags) return flags; } +/* + * USBIP driver packs URB transfer flags in PDUs that are exchanged + * between Server (usbip_host) and Client (vhci_hcd). URB_* flags + * are internal to the kernel and could change, whereas the USBIP URB + * flags exchanged in PDUs are USBIP user API and must not change. + * + * USBIP_URB_* flags are exported as an explicit API, and client and + * server do the mapping from kernel flags to USBIP_URB_*.
Details as follows: + * + * Client tx path (USBIP_CMD_SUBMIT): + * - Maps URB_* to USBIP_URB_* when it sends USBIP_CMD_SUBMIT packet. + * + * Server rx path (USBIP_CMD_SUBMIT): + * - Maps USBIP_URB_* to URB_* when it receives USBIP_CMD_SUBMIT packet. + * + * Flags aren't included in USBIP_CMD_UNLINK and USBIP_RET_SUBMIT packets + * and no special handling is needed for them in the following cases: + * - Server rx path (USBIP_CMD_UNLINK) + * - Client rx path & Server tx path (USBIP_RET_SUBMIT) + * + * Code paths: + * usbip_pack_pdu() is the common routine that handles packing pdu from + * urb and unpack pdu to an urb. + * + * usbip_pack_cmd_submit() and usbip_pack_ret_submit() handle + * USBIP_CMD_SUBMIT and USBIP_RET_SUBMIT respectively. + * + * usbip_map_urb_to_usbip() and usbip_map_usbip_to_urb() are used + * by usbip_pack_cmd_submit() and usbip_pack_ret_submit() to map + * flags. + */ + +struct urb_to_usbip_flags { + u32 urb_flag; + u32 usbip_flag; +}; + +#define NUM_USBIP_FLAGS 17 + +static const struct urb_to_usbip_flags flag_map[NUM_USBIP_FLAGS] = { + {URB_SHORT_NOT_OK, USBIP_URB_SHORT_NOT_OK}, + {URB_ISO_ASAP, USBIP_URB_ISO_ASAP}, + {URB_NO_TRANSFER_DMA_MAP, USBIP_URB_NO_TRANSFER_DMA_MAP}, + {URB_ZERO_PACKET, USBIP_URB_ZERO_PACKET}, + {URB_NO_INTERRUPT, USBIP_URB_NO_INTERRUPT}, + {URB_FREE_BUFFER, USBIP_URB_FREE_BUFFER}, + {URB_DIR_IN, USBIP_URB_DIR_IN}, + {URB_DIR_OUT, USBIP_URB_DIR_OUT}, + {URB_DIR_MASK, USBIP_URB_DIR_MASK}, + {URB_DMA_MAP_SINGLE, USBIP_URB_DMA_MAP_SINGLE}, + {URB_DMA_MAP_PAGE, USBIP_URB_DMA_MAP_PAGE}, + {URB_DMA_MAP_SG, USBIP_URB_DMA_MAP_SG}, + {URB_MAP_LOCAL, USBIP_URB_MAP_LOCAL}, + {URB_SETUP_MAP_SINGLE, USBIP_URB_SETUP_MAP_SINGLE}, + {URB_SETUP_MAP_LOCAL, USBIP_URB_SETUP_MAP_LOCAL}, + {URB_DMA_SG_COMBINED, USBIP_URB_DMA_SG_COMBINED}, + {URB_ALIGNED_TEMP_BUFFER, USBIP_URB_ALIGNED_TEMP_BUFFER}, +}; + +static unsigned int urb_to_usbip(unsigned int flags) +{ + unsigned int map_flags = 0; + int loop; + + for (loop = 0; loop < NUM_USBIP_FLAGS; loop++) { + if (flags & flag_map[loop].urb_flag) + map_flags |= flag_map[loop].usbip_flag; + } + + return map_flags; +} + +static unsigned int usbip_to_urb(unsigned int flags) +{ + unsigned int map_flags = 0; + int loop; + + for (loop = 0; loop < NUM_USBIP_FLAGS; loop++) { + if (flags & flag_map[loop].usbip_flag) + map_flags |= flag_map[loop].urb_flag; + } + + return map_flags; +} + static void usbip_pack_cmd_submit(struct usbip_header *pdu, struct urb *urb, int pack) { @@ -354,14 +439,14 @@ static void usbip_pack_cmd_submit(struct usbip_header *pdu, struct urb *urb, * will be discussed when usbip is ported to other operating systems. 
*/ if (pack) { - spdu->transfer_flags = - tweak_transfer_flags(urb->transfer_flags); + /* map after tweaking the urb flags */ + spdu->transfer_flags = urb_to_usbip(tweak_transfer_flags(urb->transfer_flags)); spdu->transfer_buffer_length = urb->transfer_buffer_length; spdu->start_frame = urb->start_frame; spdu->number_of_packets = urb->number_of_packets; spdu->interval = urb->interval; } else { - urb->transfer_flags = spdu->transfer_flags; + urb->transfer_flags = usbip_to_urb(spdu->transfer_flags); urb->transfer_buffer_length = spdu->transfer_buffer_length; urb->start_frame = spdu->start_frame; urb->number_of_packets = spdu->number_of_packets; diff --git a/include/uapi/linux/usbip.h b/include/uapi/linux/usbip.h index fd393d908d8a..e4421ad55b2e 100644 --- a/include/uapi/linux/usbip.h +++ b/include/uapi/linux/usbip.h @@ -24,4 +24,30 @@ enum usbip_device_status { VDEV_ST_USED, VDEV_ST_ERROR }; + +/* USB URB Transfer flags: + * + * USBIP server and client (vhci) pack URBs in TCP packets. The following + * are the transfer flag defines used in the USBIP protocol. + */ + +#define USBIP_URB_SHORT_NOT_OK 0x0001 +#define USBIP_URB_ISO_ASAP 0x0002 +#define USBIP_URB_NO_TRANSFER_DMA_MAP 0x0004 +#define USBIP_URB_ZERO_PACKET 0x0040 +#define USBIP_URB_NO_INTERRUPT 0x0080 +#define USBIP_URB_FREE_BUFFER 0x0100 +#define USBIP_URB_DIR_IN 0x0200 +#define USBIP_URB_DIR_OUT 0 +#define USBIP_URB_DIR_MASK USBIP_URB_DIR_IN + +#define USBIP_URB_DMA_MAP_SINGLE 0x00010000 +#define USBIP_URB_DMA_MAP_PAGE 0x00020000 +#define USBIP_URB_DMA_MAP_SG 0x00040000 +#define USBIP_URB_MAP_LOCAL 0x00080000 +#define USBIP_URB_SETUP_MAP_SINGLE 0x00100000 +#define USBIP_URB_SETUP_MAP_LOCAL 0x00200000 +#define USBIP_URB_DMA_SG_COMBINED 0x00400000 +#define USBIP_URB_ALIGNED_TEMP_BUFFER 0x00800000 + #endif /* _UAPI_LINUX_USBIP_H */ -- cgit v1.2.3 From 23c12d5fc02fb0712c64f3e87a27fcfa78e8af9c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Sep 2022 11:54:01 +0100 Subject: Revert "io_uring: add zc notification flush requests" This reverts commit 492dddb4f6e3a5839c27d41ff1fecdbe6c3ab851. Soon we won't have the very notion of notification flushing, so remove notification flushing requests.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8850334ca56e65b413cb34fd158db81d7b2865a3.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 - io_uring/rsrc.c | 38 -------------------------------------- 2 files changed, 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9e0b5c8d92ce..18ae5caf1773 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -301,7 +301,6 @@ enum io_uring_op { */ enum { IORING_RSRC_UPDATE_FILES, - IORING_RSRC_UPDATE_NOTIF, }; /* diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 71359a4d0bd4..048f7483fe8a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -15,7 +15,6 @@ #include "io_uring.h" #include "openclose.h" #include "rsrc.h" -#include "notif.h" struct io_rsrc_update { struct file *file; @@ -741,41 +740,6 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); - struct io_ring_ctx *ctx = req->ctx; - unsigned len = up->nr_args; - unsigned idx_end, idx = up->offset; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - if (unlikely(check_add_overflow(idx, len, &idx_end))) { - ret = -EOVERFLOW; - goto out; - } - if (unlikely(idx_end > ctx->nr_notif_slots)) { - ret = -EINVAL; - goto out; - } - - for (; idx < idx_end; idx++) { - struct io_notif_slot *slot = &ctx->notif_slots[idx]; - - if (!slot->notif) - continue; - if (up->arg) - slot->tag = up->arg; - io_notif_slot_flush_submit(slot, issue_flags); - } -out: - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); @@ -783,8 +747,6 @@ int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) switch (up->type) { case IORING_RSRC_UPDATE_FILES: return io_files_update(req, issue_flags); - case IORING_RSRC_UPDATE_NOTIF: - return io_notif_update(req, issue_flags); } return -EINVAL; } -- cgit v1.2.3 From d9808ceb3129b811becebdee3ec96d189c83e56c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Sep 2022 11:54:02 +0100 Subject: Revert "io_uring: rename IORING_OP_FILES_UPDATE" This reverts commit 4379d5f15b3fd4224c37841029178aa8082a242e. We removed notification flushing; also clean up the uapi preparation changes so they do not pollute the uapi.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/89edc3905350f91e1b6e26d9dbf42ee44fd451a2.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 +----------- io_uring/opdef.c | 9 ++++----- io_uring/rsrc.c | 17 ++--------------- io_uring/rsrc.h | 4 ++-- 4 files changed, 9 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 18ae5caf1773..111b651366bd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -178,8 +178,7 @@ enum io_uring_op { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, - IORING_OP_RSRC_UPDATE, - IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE, + IORING_OP_FILES_UPDATE, IORING_OP_STATX, IORING_OP_READ, IORING_OP_WRITE, @@ -228,7 +227,6 @@ enum io_uring_op { #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) - /* * sqe->splice_flags * extends splice(2) flags @@ -295,14 +293,6 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) - -/* - * IORING_OP_RSRC_UPDATE flags - */ -enum { - IORING_RSRC_UPDATE_FILES, -}; - /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 41410126c1c6..10b301ccf5cd 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -246,13 +246,12 @@ const struct io_op_def io_op_defs[] = { .prep = io_close_prep, .issue = io_close, }, - [IORING_OP_RSRC_UPDATE] = { + [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "RSRC_UPDATE", - .prep = io_rsrc_update_prep, - .issue = io_rsrc_update, - .ioprio = 1, + .name = "FILES_UPDATE", + .prep = io_files_update_prep, + .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 048f7483fe8a..cf3272113214 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -21,7 +21,6 @@ struct io_rsrc_update { u64 arg; u32 nr_args; u32 offset; - int type; }; static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, @@ -654,7 +653,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); @@ -668,7 +667,6 @@ int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); - up->type = READ_ONCE(sqe->ioprio); return 0; } @@ -711,7 +709,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, return ret; } -static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); struct io_ring_ctx *ctx = req->ctx; @@ -740,17 +738,6 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); - - switch (up->type) { - case IORING_RSRC_UPDATE_FILES: - return io_files_update(req, issue_flags); - } - return -EINVAL; -} - int 
io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f3a9a177941f..9bce15665444 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -167,8 +167,8 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } -int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags); -int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_files_update(struct io_kiocb *req, unsigned int issue_flags); +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int __io_account_mem(struct user_struct *user, unsigned long nr_pages); -- cgit v1.2.3 From 57f332246afa5929bdf2e7a5facddedb43549be4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Sep 2022 11:54:03 +0100 Subject: io_uring/notif: remove notif registration We're going to remove the userspace-exposed zerocopy notification API, so remove notification registration. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6ff00b97be99869c386958a990593c9c31cf105b.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ----- io_uring/io_uring.c | 10 ------ io_uring/net.c | 4 +-- io_uring/notif.c | 71 ------------------------------------------- io_uring/notif.h | 11 ------- 5 files changed, 1 insertion(+), 103 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 111b651366bd..b11c57b0ebb5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -279,14 +279,10 @@ enum io_uring_op { * * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in * the buf_index field. - * - * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful - * successful. Only for zerocopy sends.
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) -#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3) /* * accept flags stored in sqe->ioprio @@ -474,10 +470,6 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, - /* zerocopy notification API */ - IORING_REGISTER_NOTIFIERS = 26, - IORING_UNREGISTER_NOTIFIERS = 27, - /* this goes last */ IORING_REGISTER_LAST }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 77616279000b..c2e06a3aa18d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2640,7 +2640,6 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); - io_notif_unregister(ctx); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ @@ -3839,15 +3838,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_file_alloc_range(ctx, arg); break; - case IORING_REGISTER_NOTIFIERS: - ret = io_notif_register(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_NOTIFIERS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_notif_unregister(ctx); - break; default: ret = -EINVAL; break; diff --git a/io_uring/net.c b/io_uring/net.c index 7a5468cc905e..aac6997b7d88 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -889,7 +889,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->flags = READ_ONCE(sqe->ioprio); if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | - IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH)) + IORING_RECVSEND_FIXED_BUF)) return -EINVAL; if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); @@ -1063,8 +1063,6 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) { - io_notif_slot_flush_submit(notif_slot, 0); } if (ret >= 0) diff --git a/io_uring/notif.c b/io_uring/notif.c index 96f076b175e0..11f45640684a 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -86,74 +86,3 @@ void io_notif_slot_flush(struct io_notif_slot *slot) io_req_task_work_add(notif); } } - -__cold int io_notif_unregister(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - int i; - - if (!ctx->notif_slots) - return -ENXIO; - - for (i = 0; i < ctx->nr_notif_slots; i++) { - struct io_notif_slot *slot = &ctx->notif_slots[i]; - struct io_kiocb *notif = slot->notif; - struct io_notif_data *nd; - - if (!notif) - continue; - nd = io_notif_to_data(notif); - slot->notif = NULL; - if (!refcount_dec_and_test(&nd->uarg.refcnt)) - continue; - notif->io_task_work.func = __io_notif_complete_tw; - io_req_task_work_add(notif); - } - - kvfree(ctx->notif_slots); - ctx->notif_slots = NULL; - ctx->nr_notif_slots = 0; - return 0; -} - -__cold int io_notif_register(struct io_ring_ctx *ctx, - void __user *arg, unsigned int size) - __must_hold(&ctx->uring_lock) -{ - struct io_uring_notification_slot __user *slots; - struct io_uring_notification_slot slot; - struct io_uring_notification_register reg; - unsigned i; - - if (ctx->nr_notif_slots) - return -EBUSY; - if (size != sizeof(reg)) - return -EINVAL; - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS) - return -EINVAL; - if (reg.resv || reg.resv2 
|| reg.resv3) - return -EINVAL; - - slots = u64_to_user_ptr(reg.data); - ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]), - GFP_KERNEL_ACCOUNT); - if (!ctx->notif_slots) - return -ENOMEM; - - for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) { - struct io_notif_slot *notif_slot = &ctx->notif_slots[i]; - - if (copy_from_user(&slot, &slots[i], sizeof(slot))) { - io_notif_unregister(ctx); - return -EFAULT; - } - if (slot.resv[0] | slot.resv[1] | slot.resv[2]) { - io_notif_unregister(ctx); - return -EINVAL; - } - notif_slot->tag = slot.tag; - } - return 0; -} diff --git a/io_uring/notif.h b/io_uring/notif.h index 80f6445e0c2b..8380eeff2f2e 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -8,7 +8,6 @@ #include "rsrc.h" #define IO_NOTIF_SPLICE_BATCH 32 -#define IORING_MAX_NOTIF_SLOTS (1U << 15) struct io_notif_data { struct file *file; @@ -36,10 +35,6 @@ struct io_notif_slot { u32 seq; }; -int io_notif_register(struct io_ring_ctx *ctx, - void __user *arg, unsigned int size); -int io_notif_unregister(struct io_ring_ctx *ctx); - void io_notif_slot_flush(struct io_notif_slot *slot); struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); @@ -67,12 +62,6 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, return &ctx->notif_slots[idx]; } -static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, - unsigned int issue_flags) -{ - io_notif_slot_flush(slot); -} - static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) { struct io_ring_ctx *ctx = notif->ctx; -- cgit v1.2.3 From b48c312be05e83b55a4d58bf61f80b4a3288fb7e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Sep 2022 11:54:04 +0100 Subject: io_uring/net: simplify zerocopy send user API Following user feedback, this patch simplifies the zerocopy send API. One of the main complaints is that the current API is difficult to use, with userspace having to manage notification slots, and send retries with error handling then make it even worse. Instead of keeping notification slots, change it to a per-request notification model, which posts both completion and notification CQEs for each request when any data has been sent, and only one CQE if it fails. All notification CQEs will have IORING_CQE_F_NOTIF set, and IORING_CQE_F_MORE in completion CQEs indicates whether to wait for a notification or not. IOSQE_CQE_SKIP_SUCCESS is disallowed with zerocopy sends for now. This is less flexible, but greatly simplifies the user API and also the kernel implementation. We reuse notif helpers in this patch, but in the future there won't be a need to keep two requests.
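To make the new flow concrete, a rough userspace sketch follows (illustrative only; it assumes a liburing build that provides the io_uring_prep_send_zc() helper, and the function name and error handling are ours, not part of this patch):

#include <liburing.h>

/* Hedged sketch: issue one zerocopy send and reap both CQEs. */
static int send_zc_once(struct io_uring *ring, int sockfd,
			const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret, more;

	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0, 0);
	io_uring_submit(ring);

	/* First CQE: the send result. IORING_CQE_F_MORE says whether a
	 * notification CQE will follow.
	 */
	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	ret = cqe->res;
	more = cqe->flags & IORING_CQE_F_MORE;
	io_uring_cqe_seen(ring, cqe);

	/* Second CQE: tagged IORING_CQE_F_NOTIF; the buffer may only be
	 * reused after this CQE has been reaped.
	 */
	if (more) {
		if (io_uring_wait_cqe(ring, &cqe))
			return -1;
		io_uring_cqe_seen(ring, cqe);
	}
	return ret;
}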
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/95287640ab98fc9417370afb16e310677c63e6ce.1662027856.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++++-- io_uring/io_uring.c | 4 ++-- io_uring/net.c | 53 +++++++++++++++++++++++++++---------------- io_uring/net.h | 1 + io_uring/notif.c | 12 ++-------- io_uring/notif.h | 43 ++--------------------------------- io_uring/opdef.c | 3 ++- 7 files changed, 47 insertions(+), 76 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b11c57b0ebb5..6b83177fd41d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -71,8 +71,8 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; struct { - __u16 notification_idx; __u16 addr_len; + __u16 __pad3[1]; }; }; union { @@ -205,7 +205,7 @@ enum io_uring_op { IORING_OP_GETXATTR, IORING_OP_SOCKET, IORING_OP_URING_CMD, - IORING_OP_SENDZC_NOTIF, + IORING_OP_SEND_ZC, /* this goes last, obviously */ IORING_OP_LAST, @@ -326,10 +326,13 @@ struct io_uring_cqe { * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv + * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish + * them from sends. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) +#define IORING_CQE_F_NOTIF (1U << 3) enum { IORING_CQE_BUFFER_SHIFT = 16, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2e06a3aa18d..f9be9b7eb654 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3923,8 +3923,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); - BUILD_BUG_SQE_ELEM(44, __u16, notification_idx); - BUILD_BUG_SQE_ELEM(46, __u16, addr_len); + BUILD_BUG_SQE_ELEM(44, __u16, addr_len); + BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); diff --git a/io_uring/net.c b/io_uring/net.c index aac6997b7d88..7047c1342541 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -65,12 +65,12 @@ struct io_sendzc { struct file *file; void __user *buf; size_t len; - u16 slot_idx; unsigned msg_flags; unsigned flags; unsigned addr_len; void __user *addr; size_t done_io; + struct io_kiocb *notif; }; #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) @@ -879,12 +879,26 @@ out_free: return ret; } +void io_sendzc_cleanup(struct io_kiocb *req) +{ + struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); + + zc->notif->flags |= REQ_F_CQE_SKIP; + io_notif_flush(zc->notif); + zc->notif = NULL; +} + int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *notif; - if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) + if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) || + READ_ONCE(sqe->__pad3[0])) + return -EINVAL; + /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ + if (req->flags & REQ_F_CQE_SKIP) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); @@ -900,11 +914,17 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu =
READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(req, ctx, 0); } + notif = zc->notif = io_alloc_notif(ctx); + if (!notif) + return -ENOMEM; + notif->cqe.user_data = req->cqe.user_data; + notif->cqe.res = 0; + notif->cqe.flags = IORING_CQE_F_NOTIF; + req->flags |= REQ_F_NEED_CLEANUP; zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - zc->slot_idx = READ_ONCE(sqe->notification_idx); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -976,33 +996,20 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) { struct sockaddr_storage __address, *addr = NULL; - struct io_ring_ctx *ctx = req->ctx; struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); - struct io_notif_slot *notif_slot; - struct io_kiocb *notif; struct msghdr msg; struct iovec iov; struct socket *sock; - unsigned msg_flags; + unsigned msg_flags, cflags; int ret, min_ret = 0; if (!(req->flags & REQ_F_POLLED) && (zc->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - - if (issue_flags & IO_URING_F_UNLOCKED) - return -EAGAIN; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - notif_slot = io_get_notif_slot(ctx, zc->slot_idx); - if (!notif_slot) - return -EINVAL; - notif = io_get_notif(ctx, notif_slot); - if (!notif) - return -ENOMEM; - msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -1033,7 +1040,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) &msg.msg_iter); if (unlikely(ret)) return ret; - ret = io_notif_account_mem(notif, zc->len); + ret = io_notif_account_mem(zc->notif, zc->len); if (unlikely(ret)) return ret; } @@ -1045,7 +1052,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&msg.msg_iter); msg.msg_flags = msg_flags; - msg.msg_ubuf = &io_notif_to_data(notif)->uarg; + msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; msg.sg_from_iter = io_sg_from_iter; ret = sock_sendmsg(sock, &msg); @@ -1060,6 +1067,8 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_addr(req, addr, issue_flags); } + if (ret < 0 && !zc->done_io) + zc->notif->flags |= REQ_F_CQE_SKIP; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); @@ -1069,7 +1078,11 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) ret += zc->done_io; else if (zc->done_io) ret = zc->done_io; - io_req_set_res(req, ret, 0); + + io_notif_flush(zc->notif); + req->flags &= ~REQ_F_NEED_CLEANUP; + cflags = ret >= 0 ? 
IORING_CQE_F_MORE : 0; + io_req_set_res(req, ret, cflags); return IOU_OK; } diff --git a/io_uring/net.h b/io_uring/net.h index f91f56c6eeac..d744a0a874e7 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -55,6 +55,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags); int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_sendzc_cleanup(struct io_kiocb *req); void io_netmsg_cache_free(struct io_cache_entry *entry); #else diff --git a/io_uring/notif.c b/io_uring/notif.c index 11f45640684a..38d77165edc3 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -42,8 +42,7 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, } } -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, - struct io_notif_slot *slot) +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_kiocb *notif; @@ -59,27 +58,20 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, io_get_task_refs(1); notif->rsrc_node = NULL; io_req_set_rsrc_node(notif, ctx, 0); - notif->cqe.user_data = slot->tag; - notif->cqe.flags = slot->seq++; - notif->cqe.res = 0; nd = io_notif_to_data(notif); nd->account_pages = 0; nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.callback = io_uring_tx_zerocopy_callback; - /* master ref owned by io_notif_slot, will be dropped on flush */ refcount_set(&nd->uarg.refcnt, 1); return notif; } -void io_notif_slot_flush(struct io_notif_slot *slot) +void io_notif_flush(struct io_kiocb *notif) __must_hold(&slot->notif->ctx->uring_lock) { - struct io_kiocb *notif = slot->notif; struct io_notif_data *nd = io_notif_to_data(notif); - slot->notif = NULL; - /* drop slot's master ref */ if (refcount_dec_and_test(&nd->uarg.refcnt)) { notif->io_task_work.func = __io_notif_complete_tw; diff --git a/io_uring/notif.h b/io_uring/notif.h index 8380eeff2f2e..5b4d710c8ca5 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -15,53 +15,14 @@ struct io_notif_data { unsigned long account_pages; }; -struct io_notif_slot { - /* - * Current/active notifier. A slot holds only one active notifier at a - * time and keeps one reference to it. Flush releases the reference and - * lazily replaces it with a new notifier. - */ - struct io_kiocb *notif; - - /* - * Default ->user_data for this slot notifiers CQEs - */ - u64 tag; - /* - * Notifiers of a slot live in generations, we create a new notifier - * only after flushing the previous one. 
Track the sequential number - * for all notifiers and copy it into notifiers's cqe->cflags - */ - u32 seq; -}; - -void io_notif_slot_flush(struct io_notif_slot *slot); -struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, - struct io_notif_slot *slot); +void io_notif_flush(struct io_kiocb *notif); +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { return io_kiocb_to_cmd(notif, struct io_notif_data); } -static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx, - struct io_notif_slot *slot) -{ - if (!slot->notif) - slot->notif = io_alloc_notif(ctx, slot); - return slot->notif; -} - -static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, - unsigned idx) - __must_hold(&ctx->uring_lock) -{ - if (idx >= ctx->nr_notif_slots) - return NULL; - idx = array_index_nospec(idx, ctx->nr_notif_slots); - return &ctx->notif_slots[idx]; -} - static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) { struct io_ring_ctx *ctx = notif->ctx; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 10b301ccf5cd..c61494e0a602 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -470,7 +470,7 @@ const struct io_op_def io_op_defs[] = { .issue = io_uring_cmd, .prep_async = io_uring_cmd_prep_async, }, - [IORING_OP_SENDZC_NOTIF] = { + [IORING_OP_SEND_ZC] = { .name = "SENDZC_NOTIF", .needs_file = 1, .unbound_nonreg_file = 1, @@ -483,6 +483,7 @@ const struct io_op_def io_op_defs[] = { .prep = io_sendzc_prep, .issue = io_sendzc, .prep_async = io_sendzc_prep_async, + .cleanup = io_sendzc_cleanup, #else .prep = io_eopnotsupp_prep, #endif -- cgit v1.2.3 From 385ecfdfb5d5ad0ff37e20381c70e18af8cf1bdb Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:46 +0530 Subject: vfio: Add the device features for the low power entry and exit This patch adds the following new device features for the low power entry and exit in the header file. The implementation for the same will be added in the subsequent patches. - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT For vfio-pci based devices, with the standard PCI PM registers, all power states cannot be achieved. The platform-based power management needs to be involved to go into the lowest power state. For doing low power entry and exit with platform-based power management, these device features can be used. The entry device feature has two variants. These two variants are mainly to support the different behaviour for the low power entry. If there is any access for the VFIO device on the host side, then the device will be moved out of the low power state without the user's guest driver involvement. Some devices (for example NVIDIA VGA or 3D controller) require the user's guest driver involvement for each low-power entry. In the first variant, the host can return the device to low power automatically. The device will continue to attempt to reach low power until the low power exit feature is called. In the second variant, if the device exits low power due to an access, the host kernel will signal the user via the provided eventfd and will not return the device to low power without a subsequent call to one of the low power entry features. A call to the low power exit feature is optional if the user provided eventfd is signaled. These device features only support VFIO_DEVICE_FEATURE_SET and VFIO_DEVICE_FEATURE_PROBE operations. 
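For illustration, a minimal userspace sketch of driving these features through the VFIO_DEVICE_FEATURE ioctl follows (the helper name and error handling are ours, and it assumes an already-open VFIO device fd plus the uapi additions in this patch):

#include <linux/vfio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

/* Hedged sketch: probe for, then enter, low power with wake-up
 * notification; the eventfd is signaled when the device wakes.
 */
static int vfio_low_power_enter_wakeup(int device_fd, int *out_evfd)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_low_power_entry_with_wakeup)];
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_low_power_entry_with_wakeup *entry =
		(void *)feature->data;

	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(buf);

	/* Probe whether the SET operation of this feature is supported. */
	feature->flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	*out_evfd = eventfd(0, 0);
	if (*out_evfd < 0)
		return -1;

	entry->wakeup_eventfd = *out_evfd;
	entry->reserved = 0;
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}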
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-2-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 733a1cddde30..76a173f973de 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -986,6 +986,62 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_RUNNING_P2P = 5, }; +/* + * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power + * state with the platform-based power management. Device use of lower power + * states depends on factors managed by the runtime power management core, + * including system level support and coordinating support among dependent + * devices. Enabling device low power entry does not guarantee lower power + * usage by the device, nor is a mechanism provided through this feature to + * know the current power state of the device. If any device access happens + * (either from the host or through the vfio uAPI) when the device is in the + * low power state, then the host will move the device out of the low power + * state as necessary prior to the access. Once the access is completed, the + * device may re-enter the low power state. For single shot low power support + * with wake-up notification, see + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP below. Access to mmap'd + * device regions is disabled on LOW_POWER_ENTRY and may only be resumed after + * calling LOW_POWER_EXIT. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY 3 + +/* + * This device feature has the same behavior as + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY with the exception that the user + * provides an eventfd for wake-up notification. When the device moves out of + * the low power state for the wake-up, the host will not allow the device to + * re-enter a low power state without a subsequent user call to one of the low + * power entry device feature IOCTLs. Access to mmap'd device regions is + * disabled on LOW_POWER_ENTRY_WITH_WAKEUP and may only be resumed after the + * low power exit. The low power exit can happen either through LOW_POWER_EXIT + * or through any other access (where the wake-up notification has been + * generated). The access to mmap'd device regions will not trigger low power + * exit. + * + * The notification through the provided eventfd will be generated only when + * the device has entered and is resumed from a low power state after + * calling this device feature IOCTL. A device that has not entered low power + * state, as managed through the runtime power management core, will not + * generate a notification through the provided eventfd on access. Calling the + * LOW_POWER_EXIT feature is optional in the case where notification has been + * signaled on the provided eventfd that a resume from low power has occurred. + */ +struct vfio_device_low_power_entry_with_wakeup { + __s32 wakeup_eventfd; + __u32 reserved; +}; + +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP 4 + +/* + * Upon VFIO_DEVICE_FEATURE_SET, disallow use of device low power states as + * previously enabled via VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY or + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device features. + * This device feature IOCTL may itself generate a wakeup eventfd notification + * in the latter case if the device had previously entered a low power state. 
+ */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- cgit v1.2.3 From 44c51472bef83bb70b43e2f4b7a592096f32a855 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 31 Aug 2022 17:40:09 +0300 Subject: bpf: Support getting tunnel flags Existing 'bpf_skb_get_tunnel_key' extracts various tunnel parameters (id, ttl, tos, local and remote) but does not expose ip_tunnel_info's tun_flags to the BPF program. It makes sense to expose tun_flags to the BPF program. Assume for example multiple GRE tunnels maintained on a single GRE interface in collect_md mode. The program expects origins to initiate over GRE, however different origins use different GRE characteristics (e.g. some prefer to use GRE checksum, some do not; some pass a GRE key, some do not, etc..). A BPF program getting tun_flags can therefore remember the relevant flags (e.g. TUNNEL_CSUM, TUNNEL_SEQ...) for each initiating remote. In the reply path, the program can use 'bpf_skb_set_tunnel_key' in order to correctly reply to the remote, using similar characteristics, based on the stored tunnel flags. Introduce BPF_F_TUNINFO_FLAGS flag for bpf_skb_get_tunnel_key. If specified, 'bpf_tunnel_key->tunnel_flags' is set with the tun_flags. Decided to use the existing unused 'tunnel_ext' as the storage for the 'tunnel_flags' in order to avoid changing bpf_tunnel_key's layout. Also, the following has been considered during the design: 1. Convert the "interesting" internal TUNNEL_xxx flags back to BPF_F_yyy and place into the new 'tunnel_flags' field. This has 2 drawbacks: - The BPF_F_yyy flags are from *set_tunnel_key* enumeration space, e.g. BPF_F_ZERO_CSUM_TX. It is awkward that it is "returned" into tunnel_flags from a *get_tunnel_key* call. - Not all "interesting" TUNNEL_xxx flags can be mapped to existing BPF_F_yyy flags, and it doesn't make sense to create new BPF_F_yyy flags just for purposes of the returned tunnel_flags. 2. Place key.tun_flags into 'tunnel_flags' but mask them, keeping only "interesting" flags. That's ok, but the drawback is that what's "interesting" for my usecase might be limiting for other usecases. Therefore I decided to expose what's in key.tun_flags *as is*, which seems most flexible. The BPF user can just choose to ignore bits he's not interested in. The TUNNEL_xxx are also UAPI, so no harm exposing them back in the get_tunnel_key call. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831144010.174110-1-shmulik.ladkani@gmail.com --- include/uapi/linux/bpf.h | 10 +++++++++- net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 10 +++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 962960a98835..837c0f9b7fdd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5659,6 +5659,11 @@ enum { BPF_F_SEQ_NUMBER = (1ULL << 3), }; +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ @@ -5848,7 +5853,10 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. 
*/ + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; __u32 tunnel_label; union { __u32 local_ipv4; diff --git a/net/core/filter.c b/net/core/filter.c index 63e25d8ce501..74e2a4a0d747 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4488,7 +4488,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key void *to_orig = to; int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } @@ -4520,7 +4521,10 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - to->tunnel_ext = 0; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4ba82a1eace..793103b10eab 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5659,6 +5659,11 @@ enum { BPF_F_SEQ_NUMBER = (1ULL << 3), }; +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ @@ -5848,7 +5853,10 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. */ + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; __u32 tunnel_label; union { __u32 local_ipv4; -- cgit v1.2.3 From 5854a09b49574da5a77a0f36ad7b021a2661321d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 31 Aug 2022 14:12:42 -0500 Subject: net/ipv4: Use __DECLARE_FLEX_ARRAY() helper We now have a cleaner way to keep compatibility with user-space (a.k.a. not breaking it) when we need to keep in place a one-element array (for its use in user-space) together with a flexible-array member (for its use in kernel-space) without making it hard to read at the source level. This is through the use of the new __DECLARE_FLEX_ARRAY() helper macro. The size and memory layout of the structure is preserved after the changes. See below. Before changes: $ pahole -C ip_msfilter net/ipv4/igmp.o struct ip_msfilter { union { struct { __be32 imsf_multiaddr_aux; /* 0 4 */ __be32 imsf_interface_aux; /* 4 4 */ __u32 imsf_fmode_aux; /* 8 4 */ __u32 imsf_numsrc_aux; /* 12 4 */ __be32 imsf_slist[1]; /* 16 4 */ }; /* 0 20 */ struct { __be32 imsf_multiaddr; /* 0 4 */ __be32 imsf_interface; /* 4 4 */ __u32 imsf_fmode; /* 8 4 */ __u32 imsf_numsrc; /* 12 4 */ __be32 imsf_slist_flex[0]; /* 16 0 */ }; /* 0 16 */ }; /* 0 20 */ /* size: 20, cachelines: 1, members: 1 */ /* last cacheline: 20 bytes */ }; After changes: $ pahole -C ip_msfilter net/ipv4/igmp.o struct ip_msfilter { __be32 imsf_multiaddr; /* 0 4 */ __be32 imsf_interface; /* 4 4 */ __u32 imsf_fmode; /* 8 4 */ __u32 imsf_numsrc; /* 12 4 */ union { __be32 imsf_slist[1]; /* 16 4 */ struct { struct { } __empty_imsf_slist_flex; /* 16 0 */ __be32 imsf_slist_flex[0]; /* 16 0 */ }; /* 16 0 */ }; /* 16 4 */ /* size: 20, cachelines: 1, members: 5 */ /* last cacheline: 20 bytes */ }; In the past, we had to duplicate the whole original structure within a union, and update the names of all the members. 
Now, we just need to declare the flexible-array member to be used in kernel-space through the __DECLARE_FLEX_ARRAY() helper together with the one-element array, within a union. This makes the source code more clean and easier to read. Link: https://github.com/KSPP/linux/issues/193 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 14168225cecd..578daa6f816b 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -188,21 +188,13 @@ struct ip_mreq_source { }; struct ip_msfilter { + __be32 imsf_multiaddr; + __be32 imsf_interface; + __u32 imsf_fmode; + __u32 imsf_numsrc; union { - struct { - __be32 imsf_multiaddr_aux; - __be32 imsf_interface_aux; - __u32 imsf_fmode_aux; - __u32 imsf_numsrc_aux; - __be32 imsf_slist[1]; - }; - struct { - __be32 imsf_multiaddr; - __be32 imsf_interface; - __u32 imsf_fmode; - __u32 imsf_numsrc; - __be32 imsf_slist_flex[]; - }; + __be32 imsf_slist[1]; + __DECLARE_FLEX_ARRAY(__be32, imsf_slist_flex); }; }; -- cgit v1.2.3 From a36c421690b3e5dee38fc12abfcabda742f00064 Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Fri, 26 Aug 2022 10:00:31 -0700 Subject: wifi: nl80211: Add POWERED_ADDR_CHANGE feature Add a new extended feature bit signifying that the wireless hardware supports changing the MAC address while the underlying net_device is powered. Note that this has a different meaning from IFF_LIVE_ADDR_CHANGE as additional restrictions might be imposed by the hardware, such as: - No connection is active on this interface, carrier is off - No scan is in progress - No offchannel operations are in progress Signed-off-by: James Prestwood Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 573db20403dc..a00a23840c57 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6281,6 +6281,14 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_RADAR_BACKGROUND: Device supports background radar/CAC * detection. * + * @NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE: Device can perform a MAC address + * change without having to bring the underlying network device down + * first. For example, in station mode this can be used to vary the + * origin MAC address prior to a connection to a new AP for privacy + * or other reasons. Note that certain driver specific restrictions + * might apply, e.g. no scans in progress, no offchannel operations + * in progress, and no active connections. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6348,6 +6356,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BSS_COLOR, NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD, NL80211_EXT_FEATURE_RADAR_BACKGROUND, + NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 6522047c65764c9aaec8009e73daa8c0b138c701 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:50 +0200 Subject: wifi: nl80211: add MLD address to assoc BSS entries Add an MLD address attribute to BSS entries that the interface is currently associated with to help userspace figure out what's going on. 
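For illustration, a hedged libnl-3 sketch (modeled on iw-style code; the callback name is ours and error handling is trimmed) of how userspace could pick the new attribute up from a GET_SCAN dump:

#include <netlink/genl/genl.h>
#include <linux/nl80211.h>
#include <stdint.h>
#include <stdio.h>

/* Hedged sketch: print the MLD address, if present, for a BSS entry. */
static int bss_dump_handler(struct nl_msg *msg, void *arg)
{
	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
	struct nlattr *tb[NL80211_ATTR_MAX + 1];
	struct nlattr *bss[NL80211_BSS_MAX + 1];

	nla_parse(tb, NL80211_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
		  genlmsg_attrlen(gnlh, 0), NULL);
	if (!tb[NL80211_ATTR_BSS] ||
	    nla_parse_nested(bss, NL80211_BSS_MAX, tb[NL80211_ATTR_BSS], NULL))
		return NL_SKIP;

	/* Only reported on the BSS entries of the current association. */
	if (bss[NL80211_BSS_MLD_ADDR]) {
		const uint8_t *a = nla_data(bss[NL80211_BSS_MLD_ADDR]);

		printf("MLD address %02x:%02x:%02x:%02x:%02x:%02x\n",
		       a[0], a[1], a[2], a[3], a[4], a[5]);
	}
	return NL_SKIP;
}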
Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a00a23840c57..c32e7616a366 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4959,6 +4959,7 @@ enum nl80211_bss_scan_width { * using the nesting index as the antenna number. * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8). + * @NL80211_BSS_MLD_ADDR: MLD address of this BSS if connected to it. * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4985,6 +4986,7 @@ enum nl80211_bss { NL80211_BSS_CHAIN_SIGNAL, NL80211_BSS_FREQUENCY_OFFSET, NL80211_BSS_MLO_LINK_ID, + NL80211_BSS_MLD_ADDR, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4f5e5b763a15..c0b6629e3012 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10182,8 +10182,10 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, (nla_put_u32(msg, NL80211_BSS_STATUS, NL80211_BSS_STATUS_ASSOCIATED) || (wdev->valid_links && - nla_put_u8(msg, NL80211_BSS_MLO_LINK_ID, - link_id)))) + (nla_put_u8(msg, NL80211_BSS_MLO_LINK_ID, + link_id) || + nla_put(msg, NL80211_BSS_MLD_ADDR, ETH_ALEN, + wdev->u.client.connected_addr))))) goto nla_put_failure; } break; -- cgit v1.2.3 From 835e699ef82adfc85ac4cc3f1f237c1adfdefd20 Mon Sep 17 00:00:00 2001 From: Jagath Jog J Date: Wed, 31 Aug 2022 12:01:16 +0530 Subject: iio: Add new event type gesture and use direction for single and double tap Add new event type for tap called gesture and the direction can be used to differentiate single and double tap. This may be used by accelerometer sensors to express single and double tap events. For directional tap, modifiers like IIO_MOD_(X/Y/Z) can be used along with singletap and doubletap direction. Signed-off-by: Jagath Jog J Link: https://lore.kernel.org/r/20220831063117.4141-2-jagathjog1996@gmail.com Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 69 +++++++++++++++++++++++++++++++++ drivers/iio/industrialio-event.c | 7 +++- include/linux/iio/types.h | 2 + include/uapi/linux/iio/types.h | 3 ++ tools/iio/iio_event_monitor.c | 8 +++- 5 files changed, 87 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 80e8a38d1ee2..66e81c48ee21 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -2068,3 +2068,72 @@ Description: individual channels. If multiple channels are enabled in a scan, then the sampling_frequency of the scan may be computed from the per channel sampling frequencies. + +What: /sys/.../events/in_accel_gesture_singletap_en +What: /sys/.../events/in_accel_gesture_doubletap_en +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Device generates an event on a single or double tap. + +What: /sys/.../events/in_accel_gesture_singletap_value +What: /sys/.../events/in_accel_gesture_doubletap_value +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the threshold value that the device is comparing + against to generate the tap gesture event. The lower + threshold value increases the sensitivity of tap detection. 
+ Units and the exact meaning of value are device-specific. + +What: /sys/.../events/in_accel_gesture_tap_value_available +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Lists all available threshold values which can be used to + modify the sensitivity of the tap detection. + +What: /sys/.../events/in_accel_gesture_singletap_reset_timeout +What: /sys/.../events/in_accel_gesture_doubletap_reset_timeout +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the timeout value in seconds for the tap detector + to not look for another tap event after the event has + occurred. Basically, the minimum quiet time between two + single taps or two double taps. + +What: /sys/.../events/in_accel_gesture_tap_reset_timeout_available +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Lists all available tap reset timeout values. Units in seconds. + +What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the minimum quiet time in seconds between the two + taps of a double tap. + +What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay_available +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Lists all available delay values between two taps in the double + tap. Units in seconds. + +What: /sys/.../events/in_accel_gesture_tap_maxtomin_time +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Specifies the maximum time difference allowed between the upper + and lower peak of a tap to consider it a valid tap event. + Units in seconds. + +What: /sys/.../events/in_accel_gesture_tap_maxtomin_time_available +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Lists all available time values between the upper and lower + peak. Units in seconds.
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 0e2056894965..3d78da2531a9 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -231,12 +231,15 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", [IIO_EV_TYPE_CHANGE] = "change", [IIO_EV_TYPE_MAG_REFERENCED] = "mag_referenced", + [IIO_EV_TYPE_GESTURE] = "gesture", }; static const char * const iio_ev_dir_text[] = { [IIO_EV_DIR_EITHER] = "either", [IIO_EV_DIR_RISING] = "rising", - [IIO_EV_DIR_FALLING] = "falling" + [IIO_EV_DIR_FALLING] = "falling", + [IIO_EV_DIR_SINGLETAP] = "singletap", + [IIO_EV_DIR_DOUBLETAP] = "doubletap", }; static const char * const iio_ev_info_text[] = { @@ -247,6 +250,8 @@ static const char * const iio_ev_info_text[] = { [IIO_EV_INFO_HIGH_PASS_FILTER_3DB] = "high_pass_filter_3db", [IIO_EV_INFO_LOW_PASS_FILTER_3DB] = "low_pass_filter_3db", [IIO_EV_INFO_TIMEOUT] = "timeout", + [IIO_EV_INFO_RESET_TIMEOUT] = "reset_timeout", + [IIO_EV_INFO_TAP2_MIN_DELAY] = "tap2_min_delay", }; static enum iio_event_direction iio_ev_attr_dir(struct iio_dev_attr *attr) diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 27143b03909d..82faa98c719a 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -17,6 +17,8 @@ enum iio_event_info { IIO_EV_INFO_HIGH_PASS_FILTER_3DB, IIO_EV_INFO_LOW_PASS_FILTER_3DB, IIO_EV_INFO_TIMEOUT, + IIO_EV_INFO_RESET_TIMEOUT, + IIO_EV_INFO_TAP2_MIN_DELAY, }; #define IIO_VAL_INT 1 diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 472cead10d8d..913864221ac4 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -105,6 +105,7 @@ enum iio_event_type { IIO_EV_TYPE_MAG_ADAPTIVE, IIO_EV_TYPE_CHANGE, IIO_EV_TYPE_MAG_REFERENCED, + IIO_EV_TYPE_GESTURE, }; enum iio_event_direction { @@ -112,6 +113,8 @@ enum iio_event_direction { IIO_EV_DIR_RISING, IIO_EV_DIR_FALLING, IIO_EV_DIR_NONE, + IIO_EV_DIR_SINGLETAP, + IIO_EV_DIR_DOUBLETAP, }; #endif /* _UAPI_IIO_TYPES_H_ */ diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index 2f4581658859..b3b3ea399f67 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -69,12 +69,15 @@ static const char * const iio_ev_type_text[] = { [IIO_EV_TYPE_MAG_ADAPTIVE] = "mag_adaptive", [IIO_EV_TYPE_CHANGE] = "change", [IIO_EV_TYPE_MAG_REFERENCED] = "mag_referenced", + [IIO_EV_TYPE_GESTURE] = "gesture", }; static const char * const iio_ev_dir_text[] = { [IIO_EV_DIR_EITHER] = "either", [IIO_EV_DIR_RISING] = "rising", - [IIO_EV_DIR_FALLING] = "falling" + [IIO_EV_DIR_FALLING] = "falling", + [IIO_EV_DIR_SINGLETAP] = "singletap", + [IIO_EV_DIR_DOUBLETAP] = "doubletap", }; static const char * const iio_modifier_names[] = { @@ -227,6 +230,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_EV_TYPE_THRESH_ADAPTIVE: case IIO_EV_TYPE_MAG_ADAPTIVE: case IIO_EV_TYPE_CHANGE: + case IIO_EV_TYPE_GESTURE: break; default: return false; @@ -236,6 +240,8 @@ static bool event_is_known(struct iio_event_data *event) case IIO_EV_DIR_EITHER: case IIO_EV_DIR_RISING: case IIO_EV_DIR_FALLING: + case IIO_EV_DIR_SINGLETAP: + case IIO_EV_DIR_DOUBLETAP: case IIO_EV_DIR_NONE: break; default: -- cgit v1.2.3 From 27ed9353aec9de4277b3389c9f2b04beb6ab7622 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:57 -0700 Subject: bpf: Update descriptions for helpers bpf_get_func_arg[_cnt]() Now instead of the number of arguments, the 
number of registers holding argument values is stored in the trampoline.
Update the descriptions of the bpf_get_func_arg[_cnt]() helpers
accordingly.

Previous programs without struct arguments should continue to work as
usual.

Signed-off-by: Yonghong Song
Link: https://lore.kernel.org/r/20220831152657.2078805-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov
---
 include/uapi/linux/bpf.h       | 9 +++++----
 tools/include/uapi/linux/bpf.h | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 793103b10eab..3df78c56c1bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5079,12 +5079,12 @@ union bpf_attr {
  *
  * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
  *	Description
- *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		Get **n**-th argument register (zero based) of the traced function (for tracing programs)
  *		returned in **value**.
  *
  *	Return
  *		0 on success.
- *		**-EINVAL** if n >= arguments count of traced function.
+ *		**-EINVAL** if n >= argument register count of traced function.
  *
  * long bpf_get_func_ret(void *ctx, u64 *value)
  *	Description
@@ -5097,10 +5097,11 @@ union bpf_attr {
  *
  * long bpf_get_func_arg_cnt(void *ctx)
  *	Description
- *		Get number of arguments of the traced function (for tracing programs).
+ *		Get the number of registers in which the traced function's
+ *		arguments are stored (for tracing programs).
  *
  *	Return
- *		The number of arguments of the traced function.
+ *		The number of argument registers of the traced function.
  *
  * int bpf_get_retval(void)
  *	Description
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 793103b10eab..3df78c56c1bf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5079,12 +5079,12 @@ union bpf_attr {
  *
  * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
  *	Description
- *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		Get **n**-th argument register (zero based) of the traced function (for tracing programs)
  *		returned in **value**.
  *
  *	Return
  *		0 on success.
- *		**-EINVAL** if n >= arguments count of traced function.
+ *		**-EINVAL** if n >= argument register count of traced function.
  *
  * long bpf_get_func_ret(void *ctx, u64 *value)
  *	Description
@@ -5097,10 +5097,11 @@ union bpf_attr {
  *
  * long bpf_get_func_arg_cnt(void *ctx)
  *	Description
- *		Get number of arguments of the traced function (for tracing programs).
+ *		Get the number of registers in which the traced function's
+ *		arguments are stored (for tracing programs).
  *
  *	Return
- *		The number of arguments of the traced function.
+ *		The number of argument registers of the traced function.
  *
  * int bpf_get_retval(void)
  *	Description
-- cgit v1.2.3

From a0a4de4d897f5ce672e086cb6b9f91a306af6953 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 22 Aug 2022 16:41:21 +0200
Subject: netfilter: remove NFPROTO_DECNET

Decnet has been removed, so there is no need to reserve space in arrays
for it.
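The effect of the guard added below is worth spelling out: userspace
continues to see NFPROTO_DECNET, while the kernel's view of the enum
shrinks, so NFPROTO_NUMPROTO (and every kernel array sized by it) drops
from 13 to 11 entries:

enum {
	/* ... */
	NFPROTO_IPV6   = 10,
#ifndef __KERNEL__	/* userspace still compiles against this */
	NFPROTO_DECNET = 12,
#endif
	NFPROTO_NUMPROTO,	/* 13 in userspace, 11 in the kernel */
};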
Signed-off-by: Florian Westphal
---
 include/uapi/linux/netfilter.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index 53411ccc69db..5a79ccb76701 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -63,7 +63,9 @@ enum {
 	NFPROTO_NETDEV = 5,
 	NFPROTO_BRIDGE = 7,
 	NFPROTO_IPV6 = 10,
+#ifndef __KERNEL__ /* no longer supported by kernel */
 	NFPROTO_DECNET = 12,
+#endif
 	NFPROTO_NUMPROTO,
 };
-- cgit v1.2.3

From 42ee53f9bfd3e4cf58ae7656e0d11075f5fe8489 Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Thu, 8 Sep 2022 21:34:41 +0300
Subject: vfio: Introduce DMA logging uAPIs

DMA logging allows a device to internally record what DMAs the device
is initiating and report them back to userspace. It is part of the VFIO
migration infrastructure that allows implementing dirty page tracking
during the pre copy phase of live migration. Only DMA WRITEs are
logged, and this API is not connected to
VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE.

This patch introduces the uAPIs involved in DMA logging. It uses the
FEATURE ioctl with its GET/SET/PROBE options as described below.

It exposes a PROBE option to detect if the device supports DMA logging.
It exposes a SET option to start device DMA logging in the given IOVA
ranges.
It exposes a SET option to stop device DMA logging that was previously
started.
It exposes a GET option to read back and clear the device DMA log.

Further details are documented in vfio.h next to each specific option.

Signed-off-by: Yishai Hadas
Signed-off-by: Jason Gunthorpe
Link: https://lore.kernel.org/r/20220908183448.195262-4-yishaih@nvidia.com
Signed-off-by: Alex Williamson
---
 include/uapi/linux/vfio.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 76a173f973de..d7d8e0922376 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1042,6 +1042,92 @@ struct vfio_device_low_power_entry_with_wakeup {
  */
 #define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5
 
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET start/stop device DMA logging.
+ * VFIO_DEVICE_FEATURE_PROBE can be used to detect if the device supports
+ * DMA logging.
+ *
+ * DMA logging allows a device to internally record what DMAs the device is
+ * initiating and report them back to userspace. It is part of the VFIO
+ * migration infrastructure that allows implementing dirty page tracking
+ * during the pre copy phase of live migration. Only DMA WRITEs are logged,
+ * and this API is not connected to VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE.
+ *
+ * When DMA logging is started a range of IOVAs to monitor is provided and the
+ * device can optimize its logging to cover only the IOVA range given. Each
+ * DMA that the device initiates inside the range will be logged by the device
+ * for later retrieval.
+ *
+ * page_size is an input that hints what tracking granularity the device
+ * should try to achieve. If the device cannot do the hinted page size then
+ * it's the driver choice which page size to pick based on its support.
+ * On output the device will return the page size it selected.
+ *
+ * ranges is a pointer to an array of
+ * struct vfio_device_feature_dma_logging_range.
+ *
+ * The core kernel code guarantees to support, at minimum, a num_ranges that
+ * fits into a single kernel page.
+ * User space can try higher values but
+ * should give up if that cannot be achieved due to driver limitations.
+ *
+ * A single call to start device DMA logging can be issued and a matching stop
+ * should follow at the end. Another start is not allowed in the meantime.
+ */
+struct vfio_device_feature_dma_logging_control {
+	__aligned_u64 page_size;
+	__u32 num_ranges;
+	__u32 __reserved;
+	__aligned_u64 ranges;
+};
+
+struct vfio_device_feature_dma_logging_range {
+	__aligned_u64 iova;
+	__aligned_u64 length;
+};
+
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6
+
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET stop device DMA logging that was started
+ * by VFIO_DEVICE_FEATURE_DMA_LOGGING_START
+ */
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7
+
+/*
+ * Upon VFIO_DEVICE_FEATURE_GET read back and clear the device DMA log
+ *
+ * Query the device's DMA log for written pages within the given IOVA range.
+ * During querying the log is cleared for the IOVA range.
+ *
+ * bitmap is a pointer to an array of u64s that will hold the output bitmap
+ * with 1 bit reporting a page_size unit of IOVA. The mapping of IOVA to bits
+ * is given by:
+ *  bitmap[(addr - iova)/page_size] & (1ULL << (addr % 64))
+ *
+ * The input page_size can be any power of two value and does not have to
+ * match the value given to VFIO_DEVICE_FEATURE_DMA_LOGGING_START. The driver
+ * will format its internal logging to match the reporting page size, possibly
+ * by replicating bits if the internal page size is lower than requested.
+ *
+ * The LOGGING_REPORT will only set bits in the bitmap and never clear or
+ * perform any initialization of the user provided bitmap.
+ *
+ * If any error is returned userspace should assume that the dirty log is
+ * corrupted. Error recovery is to consider all memory dirty and try to
+ * restart the dirty tracking, or to abort/restart the whole migration.
+ *
+ * If DMA logging is not enabled, an error will be returned.
+ *
+ */
+struct vfio_device_feature_dma_logging_report {
+	__aligned_u64 iova;
+	__aligned_u64 length;
+	__aligned_u64 page_size;
+	__aligned_u64 bitmap;
+};
+
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- cgit v1.2.3

From 825cf206ed510c4a1758bef8957e2b039253e2e3 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Fri, 26 Aug 2022 23:58:44 -0700
Subject: statx: add direct I/O alignment information

Traditionally, the conditions for when DIO (direct I/O) is supported
were fairly simple. For both block devices and regular files, DIO had
to be aligned to the logical block size of the block device.

However, due to filesystem features that have been added over time
(e.g. multi-device support, data journalling, inline data, encryption,
verity, compression, checkpoint disabling, log-structured mode), the
conditions for when DIO is allowed on a regular file have gotten
increasingly complex. Whether a particular regular file supports DIO,
and with what alignment, can depend on various file attributes and
filesystem mount options, as well as which block device(s) the file's
data is located on.

Moreover, the general rule of DIO needing to be aligned to the block
device's logical block size was recently relaxed to allow user buffers
(but not file offsets) aligned to the DMA alignment instead. See commit
bf8d08532bc1 ("iomap: add support for dma aligned direct-io").

XFS has an ioctl XFS_IOC_DIOINFO that exposes DIO alignment
information. Uplifting this to the VFS is one possibility.
However, as discussed (https://lore.kernel.org/linux-fsdevel/20220120071215.123274-1-ebiggers@kernel.org/T/#u), this ioctl is rarely used and not known to be used outside of XFS-specific code. It was also never intended to indicate when a file doesn't support DIO at all, nor was it intended for block devices. Therefore, let's expose this information via statx(). Add the STATX_DIOALIGN flag and two new statx fields associated with it: * stx_dio_mem_align: the alignment (in bytes) required for user memory buffers for DIO, or 0 if DIO is not supported on the file. * stx_dio_offset_align: the alignment (in bytes) required for file offsets and I/O segment lengths for DIO, or 0 if DIO is not supported on the file. This will only be nonzero if stx_dio_mem_align is nonzero, and vice versa. Note that as with other statx() extensions, if STATX_DIOALIGN isn't set in the returned statx struct, then these new fields won't be filled in. This will happen if the file is neither a regular file nor a block device, or if the file is a regular file and the filesystem doesn't support STATX_DIOALIGN. It might also happen if the caller didn't include STATX_DIOALIGN in the request mask, since statx() isn't required to return unrequested information. This commit only adds the VFS-level plumbing for STATX_DIOALIGN. For regular files, individual filesystems will still need to add code to support it. For block devices, a separate commit will wire it up too. Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Martin K. Petersen Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220827065851.135710-2-ebiggers@kernel.org --- fs/stat.c | 2 ++ include/linux/stat.h | 2 ++ include/uapi/linux/stat.h | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/stat.c b/fs/stat.c index 9ced8860e0f3..a7930d744483 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -611,6 +611,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; + tmp.stx_dio_mem_align = stat->dio_mem_align; + tmp.stx_dio_offset_align = stat->dio_offset_align; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? 
-EFAULT : 0; } diff --git a/include/linux/stat.h b/include/linux/stat.h index 7df06931f25d..ff277ced50e9 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,6 +50,8 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u32 dio_mem_align; + u32 dio_offset_align; }; #endif diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 1500a0f58041..7cab2c65d3d7 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -124,7 +124,8 @@ struct statx { __u32 stx_dev_minor; /* 0x90 */ __u64 stx_mnt_id; - __u64 __spare2; + __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ + __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ /* 0xa0 */ __u64 __spare3[12]; /* Spare space for future expansion */ /* 0x100 */ @@ -152,6 +153,7 @@ struct statx { #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ +#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ -- cgit v1.2.3 From 2d5de004e009add27db76c5cdc9f1f7f7dc087e7 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Mon, 8 Aug 2022 10:56:11 -0700 Subject: userfaultfd: add /dev/userfaultfd for fine grained access control Historically, it has been shown that intercepting kernel faults with userfaultfd (thereby forcing the kernel to wait for an arbitrary amount of time) can be exploited, or at least can make some kinds of exploits easier. So, in 37cd0575b8 "userfaultfd: add UFFD_USER_MODE_ONLY" we changed things so, in order for kernel faults to be handled by userfaultfd, either the process needs CAP_SYS_PTRACE, or this sysctl must be configured so that any unprivileged user can do it. In a typical implementation of a hypervisor with live migration (take QEMU/KVM as one such example), we do indeed need to be able to handle kernel faults. But, both options above are less than ideal: - Toggling the sysctl increases attack surface by allowing any unprivileged user to do it. - Granting the live migration process CAP_SYS_PTRACE gives it this ability, but *also* the ability to "observe and control the execution of another process [...], and examine and change [its] memory and registers" (from ptrace(2)). This isn't something we need or want to be able to do, so granting this permission violates the "principle of least privilege". This is all a long winded way to say: we want a more fine-grained way to grant access to userfaultfd, without granting other additional permissions at the same time. To achieve this, add a /dev/userfaultfd misc device. This device provides an alternative to the userfaultfd(2) syscall for the creation of new userfaultfds. The idea is, any userfaultfds created this way will be able to handle kernel faults, without the caller having any special capabilities. Access to this mechanism is instead restricted using e.g. standard filesystem permissions. [axelrasmussen@google.com: Handle misc_register() failure properly] Link: https://lkml.kernel.org/r/20220819205201.658693-3-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20220808175614.3885028-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Nadav Amit Acked-by: Peter Xu Acked-by: Mike Rapoport Cc: Al Viro Cc: Dave Hansen Cc: Dmitry V. 
Levin Cc: Gleb Fotengauer-Malinovskiy Cc: Hugh Dickins Cc: Jan Kara Cc: Jonathan Corbet Cc: Mel Gorman Cc: Mike Kravetz Cc: Shuah Khan Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zhang Yi Cc: Mike Rapoport Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 71 +++++++++++++++++++++++++++++++--------- include/uapi/linux/userfaultfd.h | 4 +++ 2 files changed, 59 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 175de70e3adf..4de91ba9e85e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,7 @@ #include #include #include +#include int sysctl_unprivileged_userfaultfd __read_mostly; @@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; - if ((vmf->flags & FAULT_FLAG_USER) == 0 && - ctx->flags & UFFD_USER_MODE_ONLY) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; - } /* * If it's already released don't get it. This avoids to loop @@ -2056,20 +2052,11 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } -SYSCALL_DEFINE1(userfaultfd, int, flags) +static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx; int fd; - if (!sysctl_unprivileged_userfaultfd && - (flags & UFFD_USER_MODE_ONLY) == 0 && - !capable(CAP_SYS_PTRACE)) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); - return -EPERM; - } - BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ @@ -2102,8 +2089,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled. 
*/ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + static int __init userfaultfd_init(void) { + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", sizeof(struct userfaultfd_ctx), 0, diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7d32b1e797fb..005e5e306266 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -12,6 +12,10 @@ #include +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In -- cgit v1.2.3 From b4e12b2d70fd9eccdb3cef8015dc1788ca38e3fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:04 -0700 Subject: perf: Kill __PERF_SAMPLE_CALLCHAIN_EARLY There's no in-tree user anymore. Let's get rid of it. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-3-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 10 ---------- arch/x86/events/intel/core.c | 3 --- include/uapi/linux/perf_event.h | 2 -- 3 files changed, 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index dab094166693..ce5720bfb350 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; - /* - * rip recorded by IbsOpRip will not be consistent with rsp and rbp - * recorded as part of interrupt regs. Thus we need to use rip from - * interrupt regs while unwinding call stack. Setting _EARLY flag - * makes sure we unwind call-stack before perf sample rip is set to - * IbsOpRip. 
- */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; - return 0; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7f4e7e6b45f0..b16c91ac9219 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3868,9 +3868,6 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dca16582885f..e639c74cf5fb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -164,8 +164,6 @@ enum perf_event_sample_format { PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ - - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) -- cgit v1.2.3 From 061834624c87282c6d9d8c5395aaff4380e5e1fc Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:21 +0200 Subject: can: set CANFD_FDF flag in all CAN FD frame structures To simplify the testing in user space all struct canfd_frame's provided by the CAN subsystem of the Linux kernel now have the CANFD_FDF flag set in canfd_frame::flags. NB: Handcrafted ETH_P_CANFD frames introduced via PF_PACKET socket might not set this bit correctly. During the check for sufficient headroom in PF_PACKET sk_buffs the uninitialized CAN sk_buff data structures are filled. In the case of a CAN FD frame the CANFD_FDF flag is set accordingly. As the CAN frame content is already zero initialized in alloc_canfd_skb() the obsolete initialization of cf->flags in the CTU CAN FD driver has been removed as it would overwrite the already set CANFD_FDF flag. 
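The simplification for userspace mentioned above can be sketched as follows
(an illustrative reader loop, not taken from the patch): with
CAN_RAW_FD_FRAMES enabled, a read() returns either CAN_MTU or CANFD_MTU
bytes, and after this change the frame type can also be derived from the
frame itself, since every CAN FD frame now carries CANFD_FDF in
canfd_frame::flags:

#include <linux/can.h>
#include <stdio.h>
#include <unistd.h>

/* assumes 'sock' is a bound CAN_RAW socket with CAN_RAW_FD_FRAMES enabled */
static void print_one_frame(int sock)
{
	struct canfd_frame cfd;

	if (read(sock, &cfd, sizeof(cfd)) < 0)
		return;

	if (cfd.flags & CANFD_FDF)	/* CAN FD frame, no size check needed */
		printf("FD  0x%03X [%u] brs=%d\n", cfd.can_id, cfd.len,
		       !!(cfd.flags & CANFD_BRS));
	else				/* Classical CAN frame */
		printf("CC  0x%03X [%u]\n", cfd.can_id, cfd.len);
}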
Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-4-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/ctucanfd/ctucanfd_base.c | 1 - drivers/net/can/dev/skb.c | 11 +++++++++++ include/uapi/linux/can.h | 4 ++-- net/can/af_can.c | 5 +++++ 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c index 3c18d028bd8c..c4026712ab7d 100644 --- a/drivers/net/can/ctucanfd/ctucanfd_base.c +++ b/drivers/net/can/ctucanfd/ctucanfd_base.c @@ -657,7 +657,6 @@ static void ctucan_read_rx_frame(struct ctucan_priv *priv, struct canfd_frame *c cf->can_id = (idw >> 18) & CAN_SFF_MASK; /* BRS, ESI, RTR Flags */ - cf->flags = 0; if (FIELD_GET(REG_FRAME_FORMAT_W_FDF, ffw)) { if (FIELD_GET(REG_FRAME_FORMAT_W_BRS, ffw)) cf->flags |= CANFD_BRS; diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index b896e1ce3b47..adb413bdd734 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -244,6 +244,9 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); + /* set CAN FD flag by default */ + (*cfd)->flags = CANFD_FDF; + return skb; } EXPORT_SYMBOL_GPL(alloc_canfd_skb); @@ -287,6 +290,14 @@ static bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb) skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); + + /* set CANFD_FDF flag for CAN FD frames */ + if (can_is_canfd_skb(skb)) { + struct canfd_frame *cfd; + + cfd = (struct canfd_frame *)skb->data; + cfd->flags |= CANFD_FDF; + } } return true; diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 90801ada2bbe..7b23eeeb3273 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -141,8 +141,8 @@ struct can_frame { * When this is done the former differentiation via CAN_MTU / CANFD_MTU gets * lost. CANFD_FDF allows programmers to mark CAN FD frames in the case of * using struct canfd_frame for mixed CAN / CAN FD content (dual use). - * N.B. the Kernel APIs do NOT provide mixed CAN / CAN FD content inside of - * struct canfd_frame therefore the CANFD_FDF flag is disregarded by Linux. + * Since the introduction of CAN XL the CANFD_FDF flag is set in all CAN FD + * frame structures provided by the CAN subsystem of the Linux kernel. */ #define CANFD_BRS 0x01 /* bit rate switch (second bitrate for payload data) */ #define CANFD_ESI 0x02 /* error state indicator of the transmitting node */ diff --git a/net/can/af_can.c b/net/can/af_can.c index afa6c2151bc4..072a6a5c9dd1 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -205,7 +205,12 @@ int can_send(struct sk_buff *skb, int loop) if (can_is_can_skb(skb)) { skb->protocol = htons(ETH_P_CAN); } else if (can_is_canfd_skb(skb)) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + skb->protocol = htons(ETH_P_CANFD); + + /* set CAN FD flag for CAN FD frames by default */ + cfd->flags |= CANFD_FDF; } else { goto inval_skb; } -- cgit v1.2.3 From 1a3e3034c049503ec6992a4a7d573e7fff31fac4 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:22 +0200 Subject: can: canxl: introduce CAN XL data structure This patch adds defines for data structures and length information for CAN XL (CAN with eXtended data Length) which can transfer up to 2048 byte inside a single frame. 
Notable changes from CAN FD:

- the 11 bit arbitration field is now named 'priority' instead of
  'can_id' (there are no 29 bit identifiers nor RTR frames anymore)

- the data length needs a uint16 value to cover up to 2048 bytes (the
  position of the length element is different from struct can[fd]_frame)

- new fields (SDT, AF) and a SEC bit have been introduced

- the virtual CAN interface identifier is not part of the CAN XL frame
  struct, as this VCID value is stored in struct skbuff (analogous to
  the vlan id)

Acked-by: Vincent Mailhol
Signed-off-by: Oliver Hartkopp
Link: https://lore.kernel.org/all/20220912170725.120748-5-socketcan@hartkopp.net
Signed-off-by: Marc Kleine-Budde
---
 include/uapi/linux/can.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h
index 7b23eeeb3273..dd645ea72306 100644
--- a/include/uapi/linux/can.h
+++ b/include/uapi/linux/can.h
@@ -48,6 +48,7 @@
 
 #include <linux/types.h>
 #include <linux/socket.h>
+#include <linux/stddef.h> /* for offsetof */
 
 /* controller area network (CAN) kernel definitions */
 
@@ -60,6 +61,7 @@
 #define CAN_SFF_MASK 0x000007FFU /* standard frame format (SFF) */
 #define CAN_EFF_MASK 0x1FFFFFFFU /* extended frame format (EFF) */
 #define CAN_ERR_MASK 0x1FFFFFFFU /* omit EFF, RTR, ERR flags */
+#define CANXL_PRIO_MASK CAN_SFF_MASK /* 11 bit priority mask */
 
 /*
  * Controller Area Network Identifier structure
@@ -73,6 +75,7 @@ typedef __u32 canid_t;
 
 #define CAN_SFF_ID_BITS 11
 #define CAN_EFF_ID_BITS 29
+#define CANXL_PRIO_BITS CAN_SFF_ID_BITS
 
 /*
  * Controller Area Network Error Message Frame Mask structure
@@ -91,6 +94,16 @@ typedef __u32 can_err_mask_t;
 #define CANFD_MAX_DLC 15
 #define CANFD_MAX_DLEN 64
 
+/*
+ * CAN XL payload length and DLC definitions according to ISO 11898-1
+ * CAN XL DLC ranges from 0 .. 2047 => data length from 1 .. 2048 byte
+ */
+#define CANXL_MIN_DLC 0
+#define CANXL_MAX_DLC 2047
+#define CANXL_MAX_DLC_MASK 0x07FF
+#define CANXL_MIN_DLEN 1
+#define CANXL_MAX_DLEN 2048
+
 /**
  * struct can_frame - Classical CAN frame structure (aka CAN 2.0B)
  * @can_id: CAN ID of the frame and CAN_*_FLAG flags, see canid_t definition
@@ -166,8 +179,46 @@ struct canfd_frame {
 	__u8 data[CANFD_MAX_DLEN] __attribute__((aligned(8)));
 };
 
+/*
+ * defined bits for canxl_frame.flags
+ *
+ * The canxl_frame.flags element contains two bits CANXL_XLF and CANXL_SEC
+ * and shares the relative position of the struct can[fd]_frame.len element.
+ * The CANXL_XLF bit ALWAYS needs to be set to indicate a valid CAN XL frame.
+ * As a side effect setting this bit intentionally breaks the length checks
+ * for Classical CAN and CAN FD frames.
+ *
+ * Undefined bits in canxl_frame.flags are reserved and shall be set to zero.
+ */
+#define CANXL_XLF 0x80 /* mandatory CAN XL frame flag (must always be set!) */
+#define CANXL_SEC 0x01 /* Simple Extended Content (security/segmentation) */
+
+/**
+ * struct canxl_frame - CAN with e'X'tended frame 'L'ength frame structure
+ * @prio:  11 bit arbitration priority with zero'ed CAN_*_FLAG flags
+ * @flags: additional flags for CAN XL
+ * @sdt:   SDU (service data unit) type
+ * @len:   frame payload length in byte (CANXL_MIN_DLEN .. CANXL_MAX_DLEN)
+ * @af:    acceptance field
+ * @data:  CAN XL frame payload (CANXL_MIN_DLEN .. CANXL_MAX_DLEN byte)
+ *
+ * @prio shares the same position as @can_id from struct can[fd]_frame.
+ */ +struct canxl_frame { + canid_t prio; /* 11 bit priority for arbitration (canid_t) */ + __u8 flags; /* additional flags for CAN XL */ + __u8 sdt; /* SDU (service data unit) type */ + __u16 len; /* frame payload length in byte */ + __u32 af; /* acceptance field */ + __u8 data[CANXL_MAX_DLEN]; +}; + #define CAN_MTU (sizeof(struct can_frame)) #define CANFD_MTU (sizeof(struct canfd_frame)) +#define CANXL_MTU (sizeof(struct canxl_frame)) +#define CANXL_HDR_SIZE (offsetof(struct canxl_frame, data)) +#define CANXL_MIN_MTU (CANXL_HDR_SIZE + 64) +#define CANXL_MAX_MTU CANXL_MTU /* particular protocols of the protocol family PF_CAN */ #define CAN_RAW 1 /* RAW sockets */ -- cgit v1.2.3 From fb08cba12b52cba4366e858932307649dc5304e2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:23 +0200 Subject: can: canxl: update CAN infrastructure for CAN XL frames - add new ETH_P_CANXL ethernet protocol type - update skb checks for CAN XL - add alloc_canxl_skb() which now needs a data length parameter - introduce init_can_skb_reserve() to reduce code duplication Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-6-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/skb.c | 72 ++++++++++++++++++++++++++++++++----------- include/linux/can/skb.h | 23 +++++++++++++- include/uapi/linux/if_ether.h | 1 + net/can/af_can.c | 25 ++++++++++++++- 4 files changed, 101 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index adb413bdd734..791a51e2f5d6 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -187,6 +187,20 @@ void can_free_echo_skb(struct net_device *dev, unsigned int idx, } EXPORT_SYMBOL_GPL(can_free_echo_skb); +/* fill common values for CAN sk_buffs */ +static void init_can_skb_reserve(struct sk_buff *skb) +{ + skb->pkt_type = PACKET_BROADCAST; + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + can_skb_reserve(skb); + can_skb_prv(skb)->skbcnt = 0; +} + struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; @@ -200,16 +214,8 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) } skb->protocol = htons(ETH_P_CAN); - skb->pkt_type = PACKET_BROADCAST; - skb->ip_summed = CHECKSUM_UNNECESSARY; - - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - can_skb_reserve(skb); + init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; *cf = skb_put_zero(skb, sizeof(struct can_frame)); @@ -231,16 +237,8 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, } skb->protocol = htons(ETH_P_CANFD); - skb->pkt_type = PACKET_BROADCAST; - skb->ip_summed = CHECKSUM_UNNECESSARY; - - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - can_skb_reserve(skb); + init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); @@ -251,6 +249,39 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, } EXPORT_SYMBOL_GPL(alloc_canfd_skb); +struct sk_buff *alloc_canxl_skb(struct net_device *dev, + struct canxl_frame **cxl, + unsigned int data_len) +{ + struct sk_buff *skb; + + if (data_len < CANXL_MIN_DLEN || data_len > 
CANXL_MAX_DLEN) + goto out_error; + + skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + + CANXL_HDR_SIZE + data_len); + if (unlikely(!skb)) + goto out_error; + + skb->protocol = htons(ETH_P_CANXL); + init_can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + + *cxl = skb_put_zero(skb, CANXL_HDR_SIZE + data_len); + + /* set CAN XL flag and length information by default */ + (*cxl)->flags = CANXL_XLF; + (*cxl)->len = data_len; + + return skb; + +out_error: + *cxl = NULL; + + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_canxl_skb); + struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; @@ -319,6 +350,11 @@ bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb) goto inval_skb; break; + case ETH_P_CANXL: + if (!can_is_canxl_skb(skb)) + goto inval_skb; + break; + default: goto inval_skb; } diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index ddffc2fc008c..1abc25a8d144 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -30,6 +30,9 @@ void can_free_echo_skb(struct net_device *dev, unsigned int idx, struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); +struct sk_buff *alloc_canxl_skb(struct net_device *dev, + struct canxl_frame **cxl, + unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); @@ -114,11 +117,29 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } -/* get length element value from can[fd]_frame structure */ +static inline bool can_is_canxl_skb(const struct sk_buff *skb) +{ + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; + + if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU) + return false; + + /* this also checks valid CAN XL data length boundaries */ + if (skb->len != CANXL_HDR_SIZE + cxl->len) + return false; + + return cxl->flags & CANXL_XLF; +} + +/* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + if (can_is_canxl_skb(skb)) + return cxl->len; + return cfd->len; } diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index d370165bc621..69e0457eb200 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -138,6 +138,7 @@ #define ETH_P_LOCALTALK 0x0009 /* Localtalk pseudo type */ #define ETH_P_CAN 0x000C /* CAN: Controller Area Network */ #define ETH_P_CANFD 0x000D /* CANFD: CAN flexible data rate*/ +#define ETH_P_CANXL 0x000E /* CANXL: eXtended frame Length */ #define ETH_P_PPPTALK 0x0010 /* Dummy type for Atalk over PPP*/ #define ETH_P_TR_802_2 0x0011 /* 802.2 frames */ #define ETH_P_MOBITEX 0x0015 /* Mobitex (kaz@cafe.net) */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 072a6a5c9dd1..9503ab10f9b8 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -202,7 +202,9 @@ int can_send(struct sk_buff *skb, int loop) struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; - if (can_is_can_skb(skb)) { + if (can_is_canxl_skb(skb)) { + skb->protocol = htons(ETH_P_CANXL); + } else if (can_is_can_skb(skb)) { skb->protocol = 
htons(ETH_P_CAN);
 	} else if (can_is_canfd_skb(skb)) {
 		struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
@@ -702,6 +704,21 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
 	return NET_RX_SUCCESS;
 }
 
+static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
+		     struct packet_type *pt, struct net_device *orig_dev)
+{
+	if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canxl_skb(skb)))) {
+		pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
+			     dev->type, skb->len);
+
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	can_receive(skb, dev);
+	return NET_RX_SUCCESS;
+}
+
 /* af_can protocol functions */
 
 /**
@@ -826,6 +843,11 @@ static struct packet_type canfd_packet __read_mostly = {
 	.func = canfd_rcv,
 };
 
+static struct packet_type canxl_packet __read_mostly = {
+	.type = cpu_to_be16(ETH_P_CANXL),
+	.func = canxl_rcv,
+};
+
 static const struct net_proto_family can_family_ops = {
 	.family = PF_CAN,
 	.create = can_create,
@@ -865,6 +887,7 @@ static __init int can_init(void)
 
 	dev_add_pack(&can_packet);
 	dev_add_pack(&canfd_packet);
+	dev_add_pack(&canxl_packet);
 
 	return 0;
-- cgit v1.2.3

From 626332696d7506e8f844a564277bdba2dc78fcb5 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp
Date: Mon, 12 Sep 2022 19:07:25 +0200
Subject: can: raw: add CAN XL support

Enable CAN_RAW sockets to read and write CAN XL frames analogously to
the CAN FD extension (new CAN_RAW_XL_FRAMES sockopt).

A CAN XL network interface is capable of handling Classical CAN, CAN FD
and CAN XL frames. When CAN_RAW_XL_FRAMES is enabled, the CAN_RAW
socket checks whether the addressed CAN network interface is capable of
handling the provided CAN frame.

In contrast to the fixed number of bytes for

- CAN frames (CAN_MTU = sizeof(struct can_frame))
- CAN FD frames (CANFD_MTU = sizeof(struct canfd_frame))

the number of bytes when reading/writing CAN XL frames depends on the
number of data bytes. For efficiency reasons the length of the struct
canxl_frame is truncated to the needed size for read/write operations.
This leads to a calculated size of CANXL_HDR_SIZE + canxl_frame::len,
which is enforced on write() operations and guaranteed on read()
operations.

NB: Valid length values are 1 .. 2048 (CANXL_MIN_DLEN .. CANXL_MAX_DLEN).
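To make the size handling concrete, here is a minimal userspace sketch of
the write() path described above. The interface name, priority and payload
are arbitrary placeholders and most error handling is omitted:

#include <linux/can.h>
#include <linux/can/raw.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* send one CAN XL frame with a 128 byte payload on "can0" (illustrative) */
int send_one_xl_frame(void)
{
	int on = 1;
	int s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
	struct sockaddr_can addr = {
		.can_family = AF_CAN,
		.can_ifindex = if_nametoindex("can0"),
	};
	struct canxl_frame cxl = {
		.prio = 0x242,		/* 11 bit priority, see CANXL_PRIO_MASK */
		.flags = CANXL_XLF,	/* mandatory for a valid CAN XL frame */
		.len = 128,		/* CANXL_MIN_DLEN .. CANXL_MAX_DLEN */
	};

	if (s < 0)
		return -1;

	setsockopt(s, SOL_CAN_RAW, CAN_RAW_XL_FRAMES, &on, sizeof(on));
	if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return -1;

	memset(cxl.data, 0x42, cxl.len);

	/* the truncated size CANXL_HDR_SIZE + len is enforced on write() */
	if (write(s, &cxl, CANXL_HDR_SIZE + cxl.len) < 0)
		return -1;

	return close(s);
}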
Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-8-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/raw.h | 1 + net/can/raw.c | 55 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index 3386aa81fdf2..ff12f525c37c 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -62,6 +62,7 @@ enum { CAN_RAW_RECV_OWN_MSGS, /* receive my own msgs (default:off) */ CAN_RAW_FD_FRAMES, /* allow CAN FD frames (default:off) */ CAN_RAW_JOIN_FILTERS, /* all filters must match to trigger */ + CAN_RAW_XL_FRAMES, /* allow CAN XL frames (default:off) */ }; #endif /* !_UAPI_CAN_RAW_H */ diff --git a/net/can/raw.c b/net/can/raw.c index e7dfa3584e29..3eb7d3e2b541 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -50,6 +50,7 @@ #include #include #include +#include /* for can_is_canxl_dev_mtu() */ #include #include #include @@ -87,6 +88,7 @@ struct raw_sock { int loopback; int recv_own_msgs; int fd_frames; + int xl_frames; int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ @@ -129,8 +131,9 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (!ro->recv_own_msgs && oskb->sk == sk) return; - /* do not pass non-CAN2.0 frames to a legacy socket */ - if (!ro->fd_frames && oskb->len != CAN_MTU) + /* make sure to not pass oversized frames to the socket */ + if ((can_is_canfd_skb(oskb) && !ro->fd_frames && !ro->xl_frames) || + (can_is_canxl_skb(oskb) && !ro->xl_frames)) return; /* eliminate multiple filter matches for the same skb */ @@ -345,6 +348,7 @@ static int raw_init(struct sock *sk) ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; + ro->xl_frames = 0; ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ @@ -668,6 +672,15 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; + case CAN_RAW_XL_FRAMES: + if (optlen != sizeof(ro->xl_frames)) + return -EINVAL; + + if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) + return -EFAULT; + + break; + case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(ro->join_filters)) return -EINVAL; @@ -750,6 +763,12 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->fd_frames; break; + case CAN_RAW_XL_FRAMES: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->xl_frames; + break; + case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); @@ -775,7 +794,11 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct sk_buff *skb; struct net_device *dev; int ifindex; - int err; + int err = -EINVAL; + + /* check for valid CAN frame sizes */ + if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU) + return -EINVAL; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); @@ -795,15 +818,6 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!dev) return -ENXIO; - err = -EINVAL; - if (ro->fd_frames && dev->mtu == CANFD_MTU) { - if (unlikely(size != CANFD_MTU && size != CAN_MTU)) - goto put_dev; - } else { - if (unlikely(size != CAN_MTU)) - goto put_dev; - } - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) @@ -813,10 +827,27 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t 
size) can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; + /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto free_skb; + err = -EINVAL; + if (ro->xl_frames && can_is_canxl_dev_mtu(dev->mtu)) { + /* CAN XL, CAN FD and Classical CAN */ + if (!can_is_canxl_skb(skb) && !can_is_canfd_skb(skb) && + !can_is_can_skb(skb)) + goto free_skb; + } else if (ro->fd_frames && dev->mtu == CANFD_MTU) { + /* CAN FD and Classical CAN */ + if (!can_is_canfd_skb(skb) && !can_is_can_skb(skb)) + goto free_skb; + } else { + /* Classical CAN */ + if (!can_is_can_skb(skb)) + goto free_skb; + } + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); -- cgit v1.2.3 From 7e6e1b57162ed6a2d32d2f0929c27d79482ff706 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 6 Sep 2022 11:55:58 +0200 Subject: rtnetlink: advertise allmulti counter Like what was done with IFLA_PROMISCUITY, add IFLA_ALLMULTI to advertise the allmulti counter. The flag IFF_ALLMULTI is advertised only if it was directly set by a userland app. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e36d9d2c65a7..0bfa9a99ebb6 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -370,6 +370,7 @@ enum { IFLA_GRO_MAX_SIZE, IFLA_TSO_MAX_SIZE, IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f5e87fe57c83..8de9fb32db08 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1057,6 +1057,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ @@ -1765,6 +1766,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || @@ -1926,6 +1928,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, + [IFLA_ALLMULTI] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From 65b32f801bfbc54dc98144a6ec26082b59d131ee Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Thu, 8 Sep 2022 10:16:40 -0700 Subject: uapi: move IPPROTO_L2TP to in.h IPPROTO_L2TP is currently defined in l2tp.h, but most of ip protocols are defined in in.h file. Move it there in order to keep code clean. 
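Functionally nothing changes; code that used to pick up the constant from
l2tp.h can now take it from in.h alongside the other protocol numbers, for
example when opening an L2TPv3 IP-encapsulation socket (a minimal sketch,
assuming the kernel has L2TP/IP support enabled):

#include <sys/socket.h>
#include <linux/in.h>	/* IPPROTO_L2TP now lives here */

/* L2TPv3 over IP (RFC 3931); AF_INET6 works the same way */
static int l2tp_ip_socket(void)
{
	return socket(AF_INET, SOCK_DGRAM, IPPROTO_L2TP);
}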
Acked-by: Guillaume Nault Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- include/uapi/linux/in.h | 2 ++ include/uapi/linux/l2tp.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 578daa6f816b..f243ce665f74 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -68,6 +68,8 @@ enum { #define IPPROTO_PIM IPPROTO_PIM IPPROTO_COMP = 108, /* Compression Header Protocol */ #define IPPROTO_COMP IPPROTO_COMP + IPPROTO_L2TP = 115, /* Layer 2 Tunnelling Protocol */ +#define IPPROTO_L2TP IPPROTO_L2TP IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ #define IPPROTO_SCTP IPPROTO_SCTP IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */ diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index bab8c9708611..7d81c3e1ec29 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -13,8 +13,6 @@ #include #include -#define IPPROTO_L2TP 115 - /** * struct sockaddr_l2tpip - the sockaddr structure for L2TP-over-IP sockets * @l2tp_family: address family number AF_L2TPIP. -- cgit v1.2.3 From 8b189ea08c334f25dbb3d076f8adb8b80491d01d Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Thu, 8 Sep 2022 10:16:42 -0700 Subject: net/sched: flower: Add L2TPv3 filter Add support for matching on L2TPv3 session ID. Session ID can be specified only when ip proto was set to IPPROTO_L2TP. Example filter: # tc filter add dev $PF1 ingress prio 1 protocol ip \ flower \ ip_proto l2tp \ l2tpv3_sid 1234 \ skip_sw \ action mirred egress redirect dev $VF1_PR Acked-by: Guillaume Nault Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 877309d6ca3c..648a82f32666 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -592,6 +592,8 @@ enum { TCA_FLOWER_KEY_PPPOE_SID, /* be16 */ TCA_FLOWER_KEY_PPP_PROTO, /* be16 */ + TCA_FLOWER_KEY_L2TPV3_SID, /* be32 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 041d63ff809a..22d32b82bc09 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -69,6 +69,7 @@ struct fl_flow_key { struct flow_dissector_key_hash hash; struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; + struct flow_dissector_key_l2tpv3 l2tpv3; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -712,6 +713,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_NUM_OF_VLANS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, }; @@ -1790,6 +1792,11 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, sizeof(key->arp.tha)); + } else if (key->basic.ip_proto == IPPROTO_L2TP) { + fl_set_key_val(tb, &key->l2tpv3.session_id, + TCA_FLOWER_KEY_L2TPV3_SID, + &mask->l2tpv3.session_id, TCA_FLOWER_UNSPEC, + sizeof(key->l2tpv3.session_id)); } if (key->basic.ip_proto == IPPROTO_TCP || @@ -1970,6 +1977,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, num_of_vlans); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_PPPOE, pppoe); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); skb_flow_dissector_init(dissector, keys, cnt); } @@ -3196,6 +3205,13 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, sizeof(key->arp.tha)))) goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_L2TP && + fl_dump_key_val(skb, &key->l2tpv3.session_id, + TCA_FLOWER_KEY_L2TPV3_SID, + &mask->l2tpv3.session_id, + TCA_FLOWER_UNSPEC, + sizeof(key->l2tpv3.session_id))) + goto nla_put_failure; if ((key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || -- cgit v1.2.3 From 95f510d0b792f308d3d748242fe960c35bdc2c62 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:06:59 +0300 Subject: net: dsa: allow the DSA master to be seen and changed through rtnetlink Some DSA switches have multiple CPU ports, which can be used to improve CPU termination throughput, but DSA, through dsa_tree_setup_cpu_ports(), sets up only the first one, leading to suboptimal use of hardware. The desire is to not change the default configuration but to permit the user to create a dynamic mapping between individual user ports and the CPU port that they are served by, configurable through rtnetlink. It is also intended to permit load balancing between CPU ports, and in that case, the foreseen model is for the DSA master to be a bonding interface whose lowers are the physical DSA masters. To that end, we create a struct rtnl_link_ops for DSA user ports with the "dsa" kind. We expose the IFLA_DSA_MASTER link attribute that contains the ifindex of the newly desired DSA master. 
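Since no userspace tooling is shown here, the sketch below gives a rough
idea of the wire format the new rtnl_link_ops accepts: an RTM_NEWLINK
request for the user port carrying IFLA_LINKINFO / IFLA_INFO_KIND "dsa" and
a nested IFLA_INFO_DATA / IFLA_DSA_MASTER attribute holding the ifindex of
the desired master. The helper, buffer size and missing ACK parsing are all
simplifications for illustration, not part of this patch:

#include <linux/if_link.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void addattr(struct nlmsghdr *nlh, unsigned short type,
		    const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh +
					       NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	if (len)
		memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

/* move user port @port to the DSA master @master, e.g. ("swp0", "eth1") */
int dsa_set_master(const char *port, const char *master)
{
	struct sockaddr_nl snl = { .nl_family = AF_NETLINK };
	char buf[256] = {};
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct rtattr *linkinfo, *infodata;
	struct ifinfomsg *ifi;
	__u32 master_idx = if_nametoindex(master);
	int ret, fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*ifi));
	nlh->nlmsg_type = RTM_NEWLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	ifi = NLMSG_DATA(nlh);
	ifi->ifi_family = AF_UNSPEC;
	ifi->ifi_index = if_nametoindex(port);

	linkinfo = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));
	addattr(nlh, IFLA_LINKINFO, NULL, 0);		/* open nest */
	addattr(nlh, IFLA_INFO_KIND, "dsa", 4);
	infodata = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));
	addattr(nlh, IFLA_INFO_DATA, NULL, 0);		/* open nest */
	addattr(nlh, IFLA_DSA_MASTER, &master_idx, sizeof(master_idx));
	infodata->rta_len = (char *)nlh + nlh->nlmsg_len - (char *)infodata;
	linkinfo->rta_len = (char *)nlh + nlh->nlmsg_len - (char *)linkinfo;

	ret = sendto(fd, nlh, nlh->nlmsg_len, 0,
		     (struct sockaddr *)&snl, sizeof(snl));
	close(fd);
	return ret < 0 ? -1 : 0;	/* a real tool would read the ACK */
}

If an iproute2 release grows support for the "dsa" kind, the same request
can of course be generated from the command line instead of open-coding
netlink.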
Signed-off-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/net/dsa.h | 8 +++ include/uapi/linux/if_link.h | 10 ++++ net/dsa/Makefile | 10 +++- net/dsa/dsa.c | 9 +++ net/dsa/dsa2.c | 14 +++++ net/dsa/dsa_priv.h | 10 ++++ net/dsa/netlink.c | 62 +++++++++++++++++++++ net/dsa/port.c | 130 +++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 120 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 372 insertions(+), 1 deletion(-) create mode 100644 net/dsa/netlink.c (limited to 'include/uapi/linux') diff --git a/include/net/dsa.h b/include/net/dsa.h index 23eac1bda843..3f717c3fcba0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -559,6 +559,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) +#define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ + list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_user((_dp))) + #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) @@ -830,6 +834,10 @@ struct dsa_switch_ops { int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); + int (*port_change_master)(struct dsa_switch *ds, int port, + struct net_device *master, + struct netlink_ext_ack *extack); + /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 0bfa9a99ebb6..3d39fb398d65 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1375,4 +1375,14 @@ enum { #define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) +/* DSA section */ + +enum { + IFLA_DSA_UNSPEC, + IFLA_DSA_MASTER, + __IFLA_DSA_MAX, +}; + +#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/dsa/Makefile b/net/dsa/Makefile index af28c24ead18..bf57ef3bce2a 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,7 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o tag_8021q.o +dsa_core-y += \ + dsa.o \ + dsa2.o \ + master.o \ + netlink.o \ + port.o \ + slave.o \ + switch.o \ + tag_8021q.o # tagging formats obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index be7b320cda76..64b14f655b23 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -536,8 +536,16 @@ static int __init dsa_init_module(void) dsa_tag_driver_register(&DSA_TAG_DRIVER_NAME(none_ops), THIS_MODULE); + rc = rtnl_link_register(&dsa_link_ops); + if (rc) + goto netlink_register_fail; + return 0; +netlink_register_fail: + dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); + dsa_slave_unregister_notifier(); + dev_remove_pack(&dsa_pack_type); register_notifier_fail: destroy_workqueue(dsa_owq); @@ -547,6 +555,7 @@ module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { + rtnl_link_unregister(&dsa_link_ops); dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); dsa_slave_unregister_notifier(); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f1f96e2e56aa..42422ddea59b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -387,6 +387,20 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) return NULL; } +struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst) +{ + struct 
device_node *ethernet; + struct net_device *master; + struct dsa_port *cpu_dp; + + cpu_dp = dsa_tree_find_first_cpu(dst); + ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0); + master = of_find_net_device_by_node(ethernet); + of_node_put(ethernet); + + return master; +} + /* Assign the default CPU port (the first one in the tree) to all ports of the * fabric which don't already have one as part of their own switch. */ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c48c5c8ba790..d252a04ed725 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -200,6 +200,9 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, return NULL; } +/* netlink.c */ +extern struct rtnl_link_ops dsa_link_ops __read_mostly; + /* port.c */ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops); @@ -292,6 +295,8 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc); +int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, + struct netlink_ext_ack *extack); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; @@ -305,8 +310,12 @@ int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); +void dsa_slave_sync_ha(struct net_device *dev); +void dsa_slave_unsync_ha(struct net_device *dev); void dsa_slave_setup_tagger(struct net_device *slave); int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); +int dsa_slave_change_master(struct net_device *dev, struct net_device *master, + struct netlink_ext_ack *extack); int dsa_slave_manage_vlan_filtering(struct net_device *dev, bool vlan_filtering); @@ -542,6 +551,7 @@ void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag); void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag); struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, const struct net_device *lag_dev); +struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst); int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); int dsa_broadcast(unsigned long e, void *v); int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c new file mode 100644 index 000000000000..0f43bbb94769 --- /dev/null +++ b/net/dsa/netlink.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2022 NXP + */ +#include +#include + +#include "dsa_priv.h" + +static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = { + [IFLA_DSA_MASTER] = { .type = NLA_U32 }, +}; + +static int dsa_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + int err; + + if (!data) + return 0; + + if (data[IFLA_DSA_MASTER]) { + u32 ifindex = nla_get_u32(data[IFLA_DSA_MASTER]); + struct net_device *master; + + master = __dev_get_by_index(dev_net(dev), ifindex); + if (!master) + return -EINVAL; + + err = dsa_slave_change_master(dev, master, extack); + if (err) + return err; + } + + return 0; +} + +static size_t dsa_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_DSA_MASTER */ + 0; +} + +static int dsa_fill_info(struct sk_buff *skb, const 
struct net_device *dev) +{ + struct net_device *master = dsa_slave_to_master(dev); + + if (nla_put_u32(skb, IFLA_DSA_MASTER, master->ifindex)) + return -EMSGSIZE; + + return 0; +} + +struct rtnl_link_ops dsa_link_ops __read_mostly = { + .kind = "dsa", + .priv_size = sizeof(struct dsa_port), + .maxtype = IFLA_DSA_MAX, + .policy = dsa_policy, + .changelink = dsa_changelink, + .get_size = dsa_get_size, + .fill_info = dsa_fill_info, +}; diff --git a/net/dsa/port.c b/net/dsa/port.c index 4183e60db4f9..e557b42de9f2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -1374,6 +1375,135 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, return ds->ops->port_mrp_del_ring_role(ds, dp->index, mrp); } +static int dsa_port_assign_master(struct dsa_port *dp, + struct net_device *master, + struct netlink_ext_ack *extack, + bool fail_on_err) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index, err; + + err = ds->ops->port_change_master(ds, port, master, extack); + if (err && !fail_on_err) + dev_err(ds->dev, "port %d failed to assign master %s: %pe\n", + port, master->name, ERR_PTR(err)); + + if (err && fail_on_err) + return err; + + dp->cpu_dp = master->dsa_ptr; + + return 0; +} + +/* Change the dp->cpu_dp affinity for a user port. Note that both cross-chip + * notifiers and drivers have implicit assumptions about user-to-CPU-port + * mappings, so we unfortunately cannot delay the deletion of the objects + * (switchdev, standalone addresses, standalone VLANs) on the old CPU port + * until the new CPU port has been set up. So we need to completely tear down + * the old CPU port before changing it, and restore it on errors during the + * bringup of the new one. + */ +int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, + struct netlink_ext_ack *extack) +{ + struct net_device *bridge_dev = dsa_port_bridge_dev_get(dp); + struct net_device *old_master = dsa_port_to_master(dp); + struct net_device *dev = dp->slave; + struct dsa_switch *ds = dp->ds; + bool vlan_filtering; + int err, tmp; + + /* Bridges may hold host FDB, MDB and VLAN objects. These need to be + * migrated, so dynamically unoffload and later reoffload the bridge + * port. + */ + if (bridge_dev) { + dsa_port_pre_bridge_leave(dp, bridge_dev); + dsa_port_bridge_leave(dp, bridge_dev); + } + + /* The port might still be VLAN filtering even if it's no longer + * under a bridge, either due to ds->vlan_filtering_is_global or + * ds->needs_standalone_vlan_filtering. In turn this means VLANs + * on the CPU port. + */ + vlan_filtering = dsa_port_is_vlan_filtering(dp); + if (vlan_filtering) { + err = dsa_slave_manage_vlan_filtering(dev, false); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to remove standalone VLANs"); + goto rewind_old_bridge; + } + } + + /* Standalone addresses, and addresses of upper interfaces like + * VLAN, LAG, HSR need to be migrated. 
+ */ + dsa_slave_unsync_ha(dev); + + err = dsa_port_assign_master(dp, master, extack, true); + if (err) + goto rewind_old_addrs; + + dsa_slave_sync_ha(dev); + + if (vlan_filtering) { + err = dsa_slave_manage_vlan_filtering(dev, true); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to restore standalone VLANs"); + goto rewind_new_addrs; + } + } + + if (bridge_dev) { + err = dsa_port_bridge_join(dp, bridge_dev, extack); + if (err && err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to reoffload bridge"); + goto rewind_new_vlan; + } + } + + return 0; + +rewind_new_vlan: + if (vlan_filtering) + dsa_slave_manage_vlan_filtering(dev, false); + +rewind_new_addrs: + dsa_slave_unsync_ha(dev); + + dsa_port_assign_master(dp, old_master, NULL, false); + +/* Restore the objects on the old CPU port */ +rewind_old_addrs: + dsa_slave_sync_ha(dev); + + if (vlan_filtering) { + tmp = dsa_slave_manage_vlan_filtering(dev, true); + if (tmp) { + dev_err(ds->dev, + "port %d failed to restore standalone VLANs: %pe\n", + dp->index, ERR_PTR(tmp)); + } + } + +rewind_old_bridge: + if (bridge_dev) { + tmp = dsa_port_bridge_join(dp, bridge_dev, extack); + if (tmp) { + dev_err(ds->dev, + "port %d failed to rejoin bridge %s: %pe\n", + dp->index, bridge_dev->name, ERR_PTR(tmp)); + } + } + + return err; +} + void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 55094b94a5ae..00df6cf07866 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -164,6 +164,48 @@ static int dsa_slave_unsync_mc(struct net_device *dev, return dsa_slave_schedule_standalone_work(dev, DSA_MC_DEL, addr, 0); } +void dsa_slave_sync_ha(struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct netdev_hw_addr *ha; + + netif_addr_lock_bh(dev); + + netdev_for_each_synced_mc_addr(ha, dev) + dsa_slave_sync_mc(dev, ha->addr); + + netdev_for_each_synced_uc_addr(ha, dev) + dsa_slave_sync_uc(dev, ha->addr); + + netif_addr_unlock_bh(dev); + + if (dsa_switch_supports_uc_filtering(ds) || + dsa_switch_supports_mc_filtering(ds)) + dsa_flush_workqueue(); +} + +void dsa_slave_unsync_ha(struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct netdev_hw_addr *ha; + + netif_addr_lock_bh(dev); + + netdev_for_each_synced_uc_addr(ha, dev) + dsa_slave_unsync_uc(dev, ha->addr); + + netdev_for_each_synced_mc_addr(ha, dev) + dsa_slave_unsync_mc(dev, ha->addr); + + netif_addr_unlock_bh(dev); + + if (dsa_switch_supports_uc_filtering(ds) || + dsa_switch_supports_mc_filtering(ds)) + dsa_flush_workqueue(); +} + /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -2346,6 +2388,7 @@ int dsa_slave_create(struct dsa_port *port) if (slave_dev == NULL) return -ENOMEM; + slave_dev->rtnl_link_ops = &dsa_link_ops; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) slave_dev->dcbnl_ops = &dsa_slave_dcbnl_ops; @@ -2462,6 +2505,83 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } +int dsa_slave_change_master(struct net_device *dev, struct net_device *master, + struct netlink_ext_ack *extack) +{ + struct net_device *old_master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct net_device *upper; + struct list_head *iter; + int err; + + if (master == 
old_master) + return 0; + + if (!ds->ops->port_change_master) { + NL_SET_ERR_MSG_MOD(extack, + "Driver does not support changing DSA master"); + return -EOPNOTSUPP; + } + + if (!netdev_uses_dsa(master)) { + NL_SET_ERR_MSG_MOD(extack, + "Interface not eligible as DSA master"); + return -EOPNOTSUPP; + } + + netdev_for_each_upper_dev_rcu(master, upper, iter) { + if (dsa_slave_dev_check(upper)) + continue; + if (netif_is_bridge_master(upper)) + continue; + NL_SET_ERR_MSG_MOD(extack, "Cannot join master with unknown uppers"); + return -EOPNOTSUPP; + } + + /* Since we allow live-changing the DSA master, plus we auto-open the + * DSA master when the user port opens => we need to ensure that the + * new DSA master is open too. + */ + if (dev->flags & IFF_UP) { + err = dev_open(master, extack); + if (err) + return err; + } + + netdev_upper_dev_unlink(old_master, dev); + + err = netdev_upper_dev_link(master, dev, extack); + if (err) + goto out_revert_old_master_unlink; + + err = dsa_port_change_master(dp, master, extack); + if (err) + goto out_revert_master_link; + + /* Update the MTU of the new CPU port through cross-chip notifiers */ + err = dsa_slave_change_mtu(dev, dev->mtu); + if (err && err != -EOPNOTSUPP) { + netdev_warn(dev, + "nonfatal error updating MTU with new master: %pe\n", + ERR_PTR(err)); + } + + /* If the port doesn't have its own MAC address and relies on the DSA + * master's one, inherit it again from the new DSA master. + */ + if (is_zero_ether_addr(dp->mac)) + eth_hw_addr_inherit(dev, master); + + return 0; + +out_revert_master_link: + netdev_upper_dev_unlink(master, dev); +out_revert_old_master_unlink: + netdev_upper_dev_link(old_master, dev, NULL); + return err; +} + bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; -- cgit v1.2.3 From 848f3c0d47694924536e2894cb349613201321c6 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Mon, 12 Sep 2022 19:16:18 +0200 Subject: seg6: add NEXT-C-SID support for SRv6 End behavior The NEXT-C-SID mechanism described in [1] offers the possibility of encoding several SRv6 segments within a single 128 bit SID address. Such a SID address is called a Compressed SID (C-SID) container. In this way, the length of the SID List can be drastically reduced. A SID instantiated with the NEXT-C-SID flavor considers an IPv6 address logically structured in three main blocks: i) Locator-Block; ii) Locator-Node Function; iii) Argument. C-SID container +------------------------------------------------------------------+ | Locator-Block |Loc-Node| Argument | | |Function| | +------------------------------------------------------------------+ <--------- B -----------> <- NF -> <------------- A ---------------> (i) The Locator-Block can be any IPv6 prefix available to the provider; (ii) The Locator-Node Function represents the node and the function to be triggered when a packet is received on the node; (iii) The Argument carries the remaining C-SIDs in the current C-SID container. The NEXT-C-SID mechanism relies on the "flavors" framework defined in [2]. The flavors represent additional operations that can modify or extend a subset of the existing behaviors. This patch introduces the support for flavors in SRv6 End behavior implementing the NEXT-C-SID one. An SRv6 End behavior with NEXT-C-SID flavor works as an End behavior but it is capable of processing the compressed SID List encoded in C-SID containers. 
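To make the C-SID processing concrete, here is a minimal userspace sketch (illustrative only, not part of this patch) of the "shift" that the NEXT-C-SID flavor applies to the IPv6 destination address, assuming the byte-aligned default lengths of a 32-bit Locator-Block and a 16-bit Locator-Node Function:

    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>

    #define BLK_OCTETS 4 /* 32-bit Locator-Block */
    #define FNC_OCTETS 2 /* 16-bit Locator-Node Function */

    static void next_csid_advance(unsigned char da[16])
    {
            /* move the Argument up, consuming the current C-SID... */
            memmove(&da[BLK_OCTETS], &da[BLK_OCTETS + FNC_OCTETS],
                    16 - BLK_OCTETS - FNC_OCTETS);
            /* ...and zero-fill the freed tail of the Argument */
            memset(&da[16 - FNC_OCTETS], 0, FNC_OCTETS);
    }

    int main(void)
    {
            unsigned char da[16];
            char buf[INET6_ADDRSTRLEN];

            /* block fcbb:bbbb, pending C-SIDs 0100, 0200, 0300 */
            inet_pton(AF_INET6, "fcbb:bbbb:100:200:300::", da);
            next_csid_advance(da);
            /* prints fcbb:bbbb:200:300:: - C-SID 0200 is now active */
            printf("%s\n", inet_ntop(AF_INET6, da, buf, sizeof(buf)));
            return 0;
    }

The kernel-side equivalent of this shift is the seg6_next_csid_advance_arg() helper introduced below.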
An SRv6 End behavior with NEXT-C-SID flavor can be configured to support user-provided Locator-Block and Locator-Node Function lengths. In this implementation, such lengths must be evenly divisible by 8 (i.e. must be byte-aligned), otherwise the kernel informs the user about invalid values with a meaningful error code and message through netlink_ext_ack. If Locator-Block and/or Locator-Node Function lengths are not provided by the user during configuration of an SRv6 End behavior instance with NEXT-C-SID flavor, the kernel will choose their default values i.e., 32-bit Locator-Block and 16-bit Locator-Node Function. [1] - https://datatracker.ietf.org/doc/html/draft-ietf-spring-srv6-srh-compression [2] - https://datatracker.ietf.org/doc/html/rfc8986 Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- include/uapi/linux/seg6_local.h | 24 +++ net/ipv6/seg6_local.c | 335 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 356 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 332b18f318f8..4fdc424c9cb3 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -28,6 +28,7 @@ enum { SEG6_LOCAL_BPF, SEG6_LOCAL_VRFTABLE, SEG6_LOCAL_COUNTERS, + SEG6_LOCAL_FLAVORS, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -110,4 +111,27 @@ enum { #define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1) +/* SRv6 End* Flavor attributes */ +enum { + SEG6_LOCAL_FLV_UNSPEC, + SEG6_LOCAL_FLV_OPERATION, + SEG6_LOCAL_FLV_LCBLOCK_BITS, + SEG6_LOCAL_FLV_LCNODE_FN_BITS, + __SEG6_LOCAL_FLV_MAX, +}; + +#define SEG6_LOCAL_FLV_MAX (__SEG6_LOCAL_FLV_MAX - 1) + +/* Designed flavor operations for SRv6 End* Behavior */ +enum { + SEG6_LOCAL_FLV_OP_UNSPEC, + SEG6_LOCAL_FLV_OP_PSP, + SEG6_LOCAL_FLV_OP_USP, + SEG6_LOCAL_FLV_OP_USD, + SEG6_LOCAL_FLV_OP_NEXT_CSID, + __SEG6_LOCAL_FLV_OP_MAX +}; + +#define SEG6_LOCAL_FLV_OP_MAX (__SEG6_LOCAL_FLV_OP_MAX - 1) + #endif diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index f43e6f0baac1..8370726ae7bf 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -73,6 +73,55 @@ struct bpf_lwt_prog { char *name; }; +/* default length values (expressed in bits) for both Locator-Block and + * Locator-Node Function. + * + * Both SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS *must* be: + * i) greater than 0; + * ii) evenly divisible by 8. In other terms, the lengths of the + * Locator-Block and Locator-Node Function must be byte-aligned (we can + * relax this constraint in the future if really needed). + * + * Moreover, a third condition must hold: + * iii) SEG6_LOCAL_LCBLOCK_DBITS + SEG6_LOCAL_LCNODE_FN_DBITS <= 128. + * + * The correctness of SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS + * values are checked during the kernel compilation. If the compilation stops, + * check the value of these parameters to see if they meet conditions (i), (ii) + * and (iii). + */ +#define SEG6_LOCAL_LCBLOCK_DBITS 32 +#define SEG6_LOCAL_LCNODE_FN_DBITS 16 + +/* The following next_csid_chk_{cntr,lcblock,lcblock_fn}_bits macros can be + * used directly to check whether the lengths (in bits) of Locator-Block and + * Locator-Node Function are valid according to (i), (ii), (iii). 
+ */ +#define next_csid_chk_cntr_bits(blen, flen) \ + ((blen) + (flen) > 128) + +#define next_csid_chk_lcblock_bits(blen) \ +({ \ + typeof(blen) __tmp = blen; \ + (!__tmp || __tmp > 120 || (__tmp & 0x07)); \ +}) + +#define next_csid_chk_lcnode_fn_bits(flen) \ + next_csid_chk_lcblock_bits(flen) + +/* Supported Flavor operations are reported in this bitmask */ +#define SEG6_LOCAL_FLV_SUPP_OPS (BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID)) + +struct seg6_flavors_info { + /* Flavor operations */ + __u32 flv_ops; + + /* Locator-Block length, expressed in bits */ + __u8 lcblock_bits; + /* Locator-Node Function length, expressed in bits*/ + __u8 lcnode_func_bits; +}; + enum seg6_end_dt_mode { DT_INVALID_MODE = -EINVAL, DT_LEGACY_MODE = 0, @@ -136,6 +185,8 @@ struct seg6_local_lwt { #ifdef CONFIG_NET_L3_MASTER_DEV struct seg6_end_dt_info dt_info; #endif + struct seg6_flavors_info flv_info; + struct pcpu_seg6_local_counters __percpu *pcpu_counters; int headroom; @@ -271,8 +322,50 @@ int seg6_lookup_nexthop(struct sk_buff *skb, return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); } -/* regular endpoint function */ -static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo) +{ + return finfo->lcblock_bits >> 3; +} + +static __u8 seg6_flv_lcnode_func_octects(const struct seg6_flavors_info *finfo) +{ + return finfo->lcnode_func_bits >> 3; +} + +static bool seg6_next_csid_is_arg_zero(const struct in6_addr *addr, + const struct seg6_flavors_info *finfo) +{ + __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo); + __u8 blk_octects = seg6_flv_lcblock_octects(finfo); + __u8 arg_octects; + int i; + + arg_octects = 16 - blk_octects - fnc_octects; + for (i = 0; i < arg_octects; ++i) { + if (addr->s6_addr[blk_octects + fnc_octects + i] != 0x00) + return false; + } + + return true; +} + +/* assume that DA.Argument length > 0 */ +static void seg6_next_csid_advance_arg(struct in6_addr *addr, + const struct seg6_flavors_info *finfo) +{ + __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo); + __u8 blk_octects = seg6_flv_lcblock_octects(finfo); + + /* advance DA.Argument */ + memmove(&addr->s6_addr[blk_octects], + &addr->s6_addr[blk_octects + fnc_octects], + 16 - blk_octects - fnc_octects); + + memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects); +} + +static int input_action_end_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) { struct ipv6_sr_hdr *srh; @@ -291,6 +384,38 @@ drop: return -EINVAL; } +static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + + if (seg6_next_csid_is_arg_zero(daddr, finfo)) + return input_action_end_core(skb, slwt); + + /* update DA */ + seg6_next_csid_advance_arg(daddr, finfo); + + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); +} + +static bool seg6_next_csid_enabled(__u32 fops) +{ + return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); +} + +/* regular endpoint function */ +static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + + if (seg6_next_csid_enabled(finfo->flv_ops)) + return end_next_csid_core(skb, slwt); + + return input_action_end_core(skb, slwt); +} + /* regular endpoint, and forward to specified nexthop */ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -951,7 +1076,8 @@ static struct 
seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, .attrs = 0, - .optattrs = SEG6_F_LOCAL_COUNTERS, + .optattrs = SEG6_F_LOCAL_COUNTERS | + SEG6_F_ATTR(SEG6_LOCAL_FLAVORS), .input = input_action_end, }, { @@ -1132,6 +1258,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, [SEG6_LOCAL_COUNTERS] = { .type = NLA_NESTED }, + [SEG6_LOCAL_FLAVORS] = { .type = NLA_NESTED }, }; static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt, @@ -1551,6 +1678,192 @@ static void destroy_attr_counters(struct seg6_local_lwt *slwt) free_percpu(slwt->pcpu_counters); } +static const +struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = { + [SEG6_LOCAL_FLV_OPERATION] = { .type = NLA_U32 }, + [SEG6_LOCAL_FLV_LCBLOCK_BITS] = { .type = NLA_U8 }, + [SEG6_LOCAL_FLV_LCNODE_FN_BITS] = { .type = NLA_U8 }, +}; + +/* check whether the lengths of the Locator-Block and Locator-Node Function + * are compatible with the dimension of a C-SID container. + */ +static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len) +{ + /* Locator-Block and Locator-Node Function cannot exceed 128 bits + * (i.e. C-SID container lenghts). + */ + if (next_csid_chk_cntr_bits(block_len, func_len)) + return -EINVAL; + + /* Locator-Block length must be greater than zero and evenly divisible + * by 8. There must be room for a Locator-Node Function, at least. + */ + if (next_csid_chk_lcblock_bits(block_len)) + return -EINVAL; + + /* Locator-Node Function length must be greater than zero and evenly + * divisible by 8. There must be room for the Locator-Block. + */ + if (next_csid_chk_lcnode_fn_bits(func_len)) + return -EINVAL; + + return 0; +} + +static int seg6_parse_nla_next_csid_cfg(struct nlattr **tb, + struct seg6_flavors_info *finfo, + struct netlink_ext_ack *extack) +{ + __u8 func_len = SEG6_LOCAL_LCNODE_FN_DBITS; + __u8 block_len = SEG6_LOCAL_LCBLOCK_DBITS; + int rc; + + if (tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]) + block_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]); + + if (tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]) + func_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]); + + rc = seg6_chk_next_csid_cfg(block_len, func_len); + if (rc < 0) { + NL_SET_ERR_MSG(extack, + "Invalid Locator Block/Node Function lengths"); + return rc; + } + + finfo->lcblock_bits = block_len; + finfo->lcnode_func_bits = func_len; + + return 0; +} + +static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) +{ + struct seg6_flavors_info *finfo = &slwt->flv_info; + struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1]; + unsigned long fops; + int rc; + + rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX, + attrs[SEG6_LOCAL_FLAVORS], + seg6_local_flavors_policy, NULL); + if (rc < 0) + return rc; + + /* this attribute MUST always be present since it represents the Flavor + * operation(s) to be carried out. + */ + if (!tb[SEG6_LOCAL_FLV_OPERATION]) + return -EINVAL; + + fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]); + if (fops & ~SEG6_LOCAL_FLV_SUPP_OPS) { + NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)"); + return -EOPNOTSUPP; + } + + finfo->flv_ops = fops; + + if (seg6_next_csid_enabled(fops)) { + /* Locator-Block and Locator-Node Function lengths can be + * provided by the user space. Otherwise, default values are + * applied. 
+ */ + rc = seg6_parse_nla_next_csid_cfg(tb, finfo, extack); + if (rc < 0) + return rc; + } + + return 0; +} + +static int seg6_fill_nla_next_csid_cfg(struct sk_buff *skb, + struct seg6_flavors_info *finfo) +{ + if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCBLOCK_BITS, finfo->lcblock_bits)) + return -EMSGSIZE; + + if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCNODE_FN_BITS, + finfo->lcnode_func_bits)) + return -EMSGSIZE; + + return 0; +} + +static int put_nla_flavors(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; + struct nlattr *nest; + int rc; + + nest = nla_nest_start(skb, SEG6_LOCAL_FLAVORS); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, SEG6_LOCAL_FLV_OPERATION, fops)) { + rc = -EMSGSIZE; + goto err; + } + + if (seg6_next_csid_enabled(fops)) { + rc = seg6_fill_nla_next_csid_cfg(skb, finfo); + if (rc < 0) + goto err; + } + + return nla_nest_end(skb, nest); + +err: + nla_nest_cancel(skb, nest); + return rc; +} + +static int seg6_cmp_nla_next_csid_cfg(struct seg6_flavors_info *finfo_a, + struct seg6_flavors_info *finfo_b) +{ + if (finfo_a->lcblock_bits != finfo_b->lcblock_bits) + return 1; + + if (finfo_a->lcnode_func_bits != finfo_b->lcnode_func_bits) + return 1; + + return 0; +} + +static int cmp_nla_flavors(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + struct seg6_flavors_info *finfo_a = &a->flv_info; + struct seg6_flavors_info *finfo_b = &b->flv_info; + + if (finfo_a->flv_ops != finfo_b->flv_ops) + return 1; + + if (seg6_next_csid_enabled(finfo_a->flv_ops)) { + if (seg6_cmp_nla_next_csid_cfg(finfo_a, finfo_b)) + return 1; + } + + return 0; +} + +static int encap_size_flavors(struct seg6_local_lwt *slwt) +{ + struct seg6_flavors_info *finfo = &slwt->flv_info; + int nlsize; + + nlsize = nla_total_size(0) + /* nest SEG6_LOCAL_FLAVORS */ + nla_total_size(4); /* SEG6_LOCAL_FLV_OPERATION */ + + if (seg6_next_csid_enabled(finfo->flv_ops)) + nlsize += nla_total_size(1) + /* SEG6_LOCAL_FLV_LCBLOCK_BITS */ + nla_total_size(1); /* SEG6_LOCAL_FLV_LCNODE_FN_BITS */ + + return nlsize; +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt, struct netlink_ext_ack *extack); @@ -1603,6 +1916,10 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { .put = put_nla_counters, .cmp = cmp_nla_counters, .destroy = destroy_attr_counters }, + + [SEG6_LOCAL_FLAVORS] = { .parse = parse_nla_flavors, + .put = put_nla_flavors, + .cmp = cmp_nla_flavors }, }; /* call the destroy() callback (if available) for each set attribute in @@ -1916,6 +2233,9 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) /* SEG6_LOCAL_CNT_ERRORS */ nla_total_size_64bit(sizeof(__u64)); + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_FLAVORS)) + nlsize += encap_size_flavors(slwt); + return nlsize; } @@ -1971,6 +2291,15 @@ int __init seg6_local_init(void) */ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long)); + /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in + * bits) have been changed with invalid values, kernel build stops + * here. 
+ */ + BUILD_BUG_ON(next_csid_chk_cntr_bits(SEG6_LOCAL_LCBLOCK_DBITS, + SEG6_LOCAL_LCNODE_FN_DBITS)); + BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS)); + BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS)); + return lwtunnel_encap_add_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); } -- cgit v1.2.3 From ead77b65aef430d3bfe63524c243a60a29eb8d90 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:24 +0200 Subject: HID: export hid_report_type to uapi When we are dealing with eBPF, we need to have access to the report type. Currently our implementation differs from the USB standard, making it impossible for users to know the exact value short of hardcoding it themselves. And instead of a bare define, convert it to an enum. Note that we also need to make changes in the ll_driver API, but given that this will have a wider impact outside of this tree, we leave this as a TODO for the future. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-10-benjamin.tissoires@redhat.com --- drivers/hid/hid-core.c | 13 +++++++------ include/linux/hid.h | 24 ++++++++---------------- include/uapi/linux/hid.h | 12 ++++++++++++ 3 files changed, 27 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index a00dd43db8bf..ab98754522d9 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -55,7 +55,7 @@ MODULE_PARM_DESC(ignore_special_drivers, "Ignore any special drivers and handle */ struct hid_report *hid_register_report(struct hid_device *device, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int application) { struct hid_report_enum *report_enum = device->report_enum + type; @@ -967,7 +967,7 @@ static const char * const hid_report_names[] = { * parsing. */ struct hid_report *hid_validate_values(struct hid_device *hid, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts) { @@ -1954,8 +1954,8 @@ out: } EXPORT_SYMBOL_GPL(__hid_request); -int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, - int interrupt) +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; @@ -2019,7 +2019,8 @@ EXPORT_SYMBOL_GPL(hid_report_raw_event); * * This is data entry for lower layers. */ -int hid_input_report(struct hid_device *hid, int type, u8 *data, u32 size, int interrupt) +int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; @@ -2377,7 +2378,7 @@ EXPORT_SYMBOL_GPL(hid_hw_request); */ int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, unsigned char rtype, int reqtype) + size_t len, enum hid_report_type rtype, int reqtype) { if (len < 1 || len > HID_MAX_BUFFER_SIZE || !buf) return -EINVAL; diff --git a/include/linux/hid.h b/include/linux/hid.h index a43dd17bc78f..b1a33dbbc78e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -314,15 +314,6 @@ struct hid_item { #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 -/* - * HID report types --- Ouch! HID spec says 1 2 3! 
- */ - -#define HID_INPUT_REPORT 0 -#define HID_OUTPUT_REPORT 1 -#define HID_FEATURE_REPORT 2 - -#define HID_REPORT_TYPES 3 /* * HID connect requests @@ -509,7 +500,7 @@ struct hid_report { struct list_head hidinput_list; struct list_head field_entry_list; /* ordered list of input fields */ unsigned int id; /* id of this report */ - unsigned int type; /* report type */ + enum hid_report_type type; /* report type */ unsigned int application; /* application usage for this report */ struct hid_field *field[HID_MAX_FIELDS]; /* fields of the report */ struct hid_field_entry *field_entries; /* allocated memory of input field_entry */ @@ -926,7 +917,8 @@ extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); -int hid_input_report(struct hid_device *, int type, u8 *, u32, int); +int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); @@ -935,11 +927,11 @@ int __hid_request(struct hid_device *hid, struct hid_report *rep, int reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int application); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, - unsigned int type, unsigned int id, + enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts); @@ -1111,7 +1103,7 @@ void hid_hw_request(struct hid_device *hdev, struct hid_report *report, int reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, unsigned char rtype, int reqtype); + size_t len, enum hid_report_type rtype, int reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** @@ -1184,8 +1176,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, + int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/uapi/linux/hid.h b/include/uapi/linux/hid.h index b34492a87a8a..b25b0bacaff2 100644 --- a/include/uapi/linux/hid.h +++ b/include/uapi/linux/hid.h @@ -42,6 +42,18 @@ #define USB_INTERFACE_PROTOCOL_KEYBOARD 1 #define USB_INTERFACE_PROTOCOL_MOUSE 2 +/* + * HID report types --- Ouch! HID spec says 1 2 3! + */ + +enum hid_report_type { + HID_INPUT_REPORT = 0, + HID_OUTPUT_REPORT = 1, + HID_FEATURE_REPORT = 2, + + HID_REPORT_TYPES, +}; + /* * HID class requests */ -- cgit v1.2.3 From 735e1bb1b8067e209941a6bdfde23214696ff47e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 2 Sep 2022 15:29:25 +0200 Subject: HID: convert defines of HID class requests into a proper enum This allows to export the type in BTF and so in the automatically generated vmlinux.h. 
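For example, once the enum is part of BTF, an eBPF program can use the symbolic report-type constants straight from the generated vmlinux.h instead of hardcoding 0/1/2; a hypothetical tracing sketch (not part of this patch):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    SEC("fentry/hid_input_report")
    int BPF_PROG(trace_hid_report, struct hid_device *hid,
                 enum hid_report_type type)
    {
            if (type == HID_FEATURE_REPORT) /* symbolic, thanks to BTF */
                    bpf_printk("feature report received");
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";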
It will also add some static checks on the users when we change the ll driver API (see note below). Note that we also need to make changes in the ll_driver API, but given that this will have a wider impact outside of this tree, we leave this as a TODO for the future. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220902132938.2409206-11-benjamin.tissoires@redhat.com --- drivers/hid/hid-core.c | 6 +++--- include/linux/hid.h | 9 +++++---- include/uapi/linux/hid.h | 14 ++++++++------ 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index ab98754522d9..aff37d6f587c 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1921,7 +1921,7 @@ static struct hid_report *hid_get_report(struct hid_report_enum *report_enum, * DO NOT USE in hid drivers directly, but through hid_hw_request instead. */ int __hid_request(struct hid_device *hid, struct hid_report *report, - int reqtype) + enum hid_class_request reqtype) { char *buf; int ret; @@ -2353,7 +2353,7 @@ EXPORT_SYMBOL_GPL(hid_hw_close); * @reqtype: hid request type */ void hid_hw_request(struct hid_device *hdev, - struct hid_report *report, int reqtype) + struct hid_report *report, enum hid_class_request reqtype) { if (hdev->ll_driver->request) return hdev->ll_driver->request(hdev, report, reqtype); @@ -2378,7 +2378,7 @@ EXPORT_SYMBOL_GPL(hid_hw_request); */ int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, enum hid_report_type rtype, int reqtype) + size_t len, enum hid_report_type rtype, enum hid_class_request reqtype) { if (len < 1 || len > HID_MAX_BUFFER_SIZE || !buf) return -EINVAL; diff --git a/include/linux/hid.h b/include/linux/hid.h index b1a33dbbc78e..8677ae38599e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -923,7 +923,7 @@ struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); void hid_output_report(struct hid_report *report, __u8 *data); -int __hid_request(struct hid_device *hid, struct hid_report *rep, int reqtype); +int __hid_request(struct hid_device *hid, struct hid_report *rep, enum hid_class_request reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, @@ -1100,10 +1100,11 @@ void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, - struct hid_report *report, int reqtype); + struct hid_report *report, enum hid_class_request reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, - size_t len, enum hid_report_type rtype, int reqtype); + size_t len, enum hid_report_type rtype, + enum hid_class_request reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** @@ -1131,7 +1132,7 @@ static inline int hid_hw_power(struct hid_device *hdev, int level) * @reqtype: hid request type */ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, - int reqtype) + enum hid_class_request reqtype) { if (hdev->ll_driver->idle) return hdev->ll_driver->idle(hdev, report, idle, reqtype); diff --git a/include/uapi/linux/hid.h 
b/include/uapi/linux/hid.h index b25b0bacaff2..a4dcb34386e3 100644 --- a/include/uapi/linux/hid.h +++ b/include/uapi/linux/hid.h @@ -58,12 +58,14 @@ enum hid_report_type { * HID class requests */ -#define HID_REQ_GET_REPORT 0x01 -#define HID_REQ_GET_IDLE 0x02 -#define HID_REQ_GET_PROTOCOL 0x03 -#define HID_REQ_SET_REPORT 0x09 -#define HID_REQ_SET_IDLE 0x0A -#define HID_REQ_SET_PROTOCOL 0x0B +enum hid_class_request { + HID_REQ_GET_REPORT = 0x01, + HID_REQ_GET_IDLE = 0x02, + HID_REQ_GET_PROTOCOL = 0x03, + HID_REQ_SET_REPORT = 0x09, + HID_REQ_SET_IDLE = 0x0A, + HID_REQ_SET_PROTOCOL = 0x0B, +}; /* * HID class descriptor types -- cgit v1.2.3 From 7b5541a932c21f7e07f068a785afcc25986e4893 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Sep 2022 14:03:09 +0200 Subject: headers: Remove some left-over license text in include/uapi/linux/netfilter/ When the SPDX-License-Identifier tags were added, the corresponding license text was not removed. Remove it now. Also, in xt_connmark.h, move the copyright text to the top of the file, which is a much more common pattern. Signed-off-by: Christophe JAILLET Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/ipset/ip_set.h | 4 ---- include/uapi/linux/netfilter/xt_AUDIT.h | 4 ---- include/uapi/linux/netfilter/xt_connmark.h | 13 ++++--------- include/uapi/linux/netfilter/xt_osf.h | 14 -------------- 4 files changed, 4 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 6397d75899bc..79e5d68b87af 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -3,10 +3,6 @@ * Patrick Schaaf * Martin Josefsson * Copyright (C) 2003-2011 Jozsef Kadlecsik - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _UAPI_IP_SET_H #define _UAPI_IP_SET_H diff --git a/include/uapi/linux/netfilter/xt_AUDIT.h b/include/uapi/linux/netfilter/xt_AUDIT.h index 1b314e2f84ac..56a3f6092e0c 100644 --- a/include/uapi/linux/netfilter/xt_AUDIT.h +++ b/include/uapi/linux/netfilter/xt_AUDIT.h @@ -4,10 +4,6 @@ * * (C) 2010-2011 Thomas Graf * (C) 2010-2011 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _XT_AUDIT_TARGET_H diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index f01c19b83a2b..41b578ccd03b 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -1,18 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + */ + #ifndef _XT_CONNMARK_H #define _XT_CONNMARK_H #include -/* Copyright (C) 2002,2004 MARA Systems AB - * by Henrik Nordstrom - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - enum { XT_CONNMARK_SET = 0, XT_CONNMARK_SAVE, diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index 6e466236ca4b..f1f097896bdf 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -1,20 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2003+ Evgeniy Polyakov - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _XT_OSF_H -- cgit v1.2.3 From c0e0d6ba25f180ab76d3c18f8b360a119dffa634 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 30 Aug 2022 05:50:10 -0700 Subject: io_uring: add IORING_SETUP_DEFER_TASKRUN Allow deferring async tasks until the user calls io_uring_enter(2) with the IORING_ENTER_GETEVENTS flag. Enable this mode with a flag at io_uring_setup time. This functionality requires that the later io_uring_enter will be called from the same submission task, and therefore restrict this flag to work only when IORING_SETUP_SINGLE_ISSUER is also set. Being able to hand-pick when tasks are run prevents the problem where there is current work to be done, however task work runs anyway. For example, a common workload would obtain a batch of CQEs, and process each one. Interrupting this with additional task work would add latency but not gain anything. If instead task work is deferred to just before more CQEs are obtained then no additional latency is added. The way this is implemented is by trying to keep task work local to an io_ring_ctx, rather than to the submission task. This is required, as the application will want to wake up only a single io_ring_ctx at a time to process work, and so the lists of work have to be kept separate. This has some other benefits like not having to check the task continually in handle_tw_list (and potentially unlocking/locking those), and reducing locks in the submit & process completions path. There are networking cases where using this option can reduce request latency by 50%. For example, a contrived benchmark using [1] where the client sends 2k data and receives the same data back while doing some system calls (to trigger task work) shows this reduction. The reason ends up being that if sending responses is delayed by processing task work, then the client side sits idle. Whereas reordering the sends first means that the client runs its workload in parallel with the local task work. 
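As an illustration (not part of this patch), opting in from userspace is a one-flag change at setup time; a liburing-based sketch, assuming a liburing built against these headers:

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
            struct io_uring ring;
            struct io_uring_params p = { };

            /* DEFER_TASKRUN is only accepted together with SINGLE_ISSUER */
            p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
            if (io_uring_queue_init_params(8, &ring, &p) < 0) {
                    fprintf(stderr, "io_uring_queue_init_params failed\n");
                    return 1;
            }
            /* from here on, task work for this ring only runs when this
             * thread enters the kernel to reap events
             */
            io_uring_queue_exit(&ring);
            return 0;
    }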
[1]: Using https://github.com/DylanZA/netbench/tree/defer_run Client: ./netbench --client_only 1 --control_port 10000 --host --tx "epoll --threads 16 --per_thread 1 --size 2048 --resp 2048 --workload 1000" Server: ./netbench --server_only 1 --control_port 10000 --rx "io_uring --defer_taskrun 0 --workload 100" --rx "io_uring --defer_taskrun 1 --workload 100" Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220830125013.570060-5-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 + include/uapi/linux/io_uring.h | 7 ++ io_uring/cancel.c | 2 +- io_uring/io_uring.c | 147 ++++++++++++++++++++++++++++++++++++----- io_uring/io_uring.h | 29 ++++++-- io_uring/rsrc.c | 2 +- 6 files changed, 168 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 677a25d44d7f..d56ff2185168 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -301,6 +301,8 @@ struct io_ring_ctx { struct io_hash_table cancel_table; bool poll_multi_queue; + struct llist_head work_llist; + struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6b83177fd41d..972b179bc07a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -157,6 +157,13 @@ enum { */ #define IORING_SETUP_SINGLE_ISSUER (1U << 12) +/* + * Defer running task work to get events. + * Rather than running bits of task work whenever the task transitions + * try to do it just before it is needed. + */ +#define IORING_SETUP_DEFER_TASKRUN (1U << 13) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 5fc5d3e80fcb..2291a53cdabd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -292,7 +292,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) break; mutex_unlock(&ctx->uring_lock); - ret = io_run_task_work_sig(); + ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); break; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index edf7381b0215..1f0df14c3062 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -142,7 +142,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_dismantle_req(struct io_kiocb *req); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); - +static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; @@ -316,6 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); + init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_WQ_LIST(&ctx->locked_free_list); @@ -1047,12 +1048,36 @@ void tctx_task_work(struct callback_head *cb) trace_io_uring_task_work_run(tctx, count, loops); } -void io_req_task_work_add(struct io_kiocb *req) +static void io_req_local_work_add(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) + return; + + if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { + io_move_task_work_from_local(ctx); + return; + } + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + 
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + io_cqring_wake(ctx); + +} + +static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; struct llist_node *node; + if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + io_req_local_work_add(req); + return; + } + /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; @@ -1074,6 +1099,73 @@ void io_req_task_work_add(struct io_kiocb *req) } } +void io_req_task_work_add(struct io_kiocb *req) +{ + __io_req_task_work_add(req, true); +} + +static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) +{ + struct llist_node *node; + + node = llist_del_all(&ctx->work_llist); + while (node) { + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + node = node->next; + __io_req_task_work_add(req, false); + } +} + +int io_run_local_work(struct io_ring_ctx *ctx) +{ + bool locked; + struct llist_node *node; + struct llist_node fake; + struct llist_node *current_final = NULL; + int ret; + + if (unlikely(ctx->submitter_task != current)) { + /* maybe this is before any submissions */ + if (!ctx->submitter_task) + return 0; + + return -EEXIST; + } + + locked = mutex_trylock(&ctx->uring_lock); + + node = io_llist_xchg(&ctx->work_llist, &fake); + ret = 0; +again: + while (node != current_final) { + struct llist_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + req->io_task_work.func(req, &locked); + ret++; + node = next; + } + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); + if (node != &fake) { + current_final = &fake; + node = io_llist_xchg(&ctx->work_llist, &fake); + goto again; + } + + if (locked) { + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } + return ret; +} + static void io_req_tw_post(struct io_kiocb *req, bool *locked) { io_req_complete_post(req); @@ -1285,8 +1377,10 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) u32 tail = ctx->cached_cq_tail; mutex_unlock(&ctx->uring_lock); - io_run_task_work(); + ret = io_run_task_work_ctx(ctx); mutex_lock(&ctx->uring_lock); + if (ret < 0) + break; /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || @@ -2148,7 +2242,9 @@ struct io_wait_queue { static inline bool io_has_work(struct io_ring_ctx *ctx) { - return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || + ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + !llist_empty(&ctx->work_llist)); } static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -2180,9 +2276,9 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return -1; } -int io_run_task_work_sig(void) +int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (io_run_task_work()) + if (io_run_task_work_ctx(ctx) > 0) return 1; if (task_sigpending(current)) return -EINTR; @@ -2198,7 +2294,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, unsigned long check_cq; /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(); + ret = io_run_task_work_sig(ctx); if (ret || io_should_wake(iowq)) return ret; @@ -2229,12 
+2325,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, int ret; do { + /* always run at least 1 task work to process local work */ + ret = io_run_task_work_ctx(ctx); + if (ret < 0) + return ret; io_cqring_overflow_flush(ctx); if (io_cqring_events(ctx) >= min_events) return 0; - if (!io_run_task_work()) - break; - } while (1); + } while (ret > 0); if (sig) { #ifdef CONFIG_COMPAT @@ -2575,6 +2673,9 @@ static __cold void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); + while (io_uring_try_cancel_requests(ctx, NULL, true)) cond_resched(); @@ -2769,13 +2870,15 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) - ret |= io_run_task_work(); + ret |= io_run_task_work() > 0; return ret; } @@ -3060,8 +3163,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto iopoll_locked; mutex_unlock(&ctx->uring_lock); } + if (flags & IORING_ENTER_GETEVENTS) { int ret2; + if (ctx->syscall_iopoll) { /* * We disallow the app entering submit/complete with @@ -3290,17 +3395,29 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ctx->flags & IORING_SETUP_SQPOLL) { /* IPI related flags don't make sense with SQPOLL */ if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG)) + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { ctx->notify_method = TWA_SIGNAL_NO_IPI; } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && + !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) goto err; ctx->notify_method = TWA_SIGNAL; } + /* + * For DEFER_TASKRUN we require the completion task to be the same as the + * submission task. This implies that there is only one submitter, so enforce + * that. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && + !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { + goto err; + } + /* * This is just grabbed for accounting purposes. 
When a process exits, * the mm is exited and dropped before the files, hence we need to hang @@ -3401,7 +3518,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER)) + IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -3864,7 +3981,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; - io_run_task_work(); + io_run_task_work_ctx(ctx); mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2f73f83af960..f417d75d7bc1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -26,7 +26,8 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); bool io_req_cqe_overflow(struct io_kiocb *req); -int io_run_task_work_sig(void); +int io_run_task_work_sig(struct io_ring_ctx *ctx); +int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); @@ -221,17 +222,37 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_run_task_work(void) +static inline int io_run_task_work(void) { if (test_thread_flag(TIF_NOTIFY_SIGNAL)) { __set_current_state(TASK_RUNNING); clear_notify_signal(); if (task_work_pending(current)) task_work_run(); - return true; + return 1; } - return false; + return 0; +} + +static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) +{ + int ret = 0; + int ret2; + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + ret = io_run_local_work(ctx); + + /* want to run this after in case more is added */ + ret2 = io_run_task_work(); + + /* Try propagate error in favour of if tasks were run, + * but still make sure to run them if requested + */ + if (ret >= 0) + ret += ret2; + + return ret; } static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index cf3272113214..6f88ded0e7e5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -341,7 +341,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); - ret = io_run_task_work_sig(); + ret = io_run_task_work_sig(ctx); mutex_lock(&ctx->uring_lock); } while (ret >= 0); data->quiesce = false; -- cgit v1.2.3 From bcc57a48eaee63a71983996c4c9d89ce7cbf55d9 Mon Sep 17 00:00:00 2001 From: Andrea Merello Date: Wed, 7 Sep 2022 15:21:52 +0200 Subject: iio: add modifiers for linear acceleration Add IIO_MOD_LINEAR_X, IIO_MOD_LINEAR_Y and IIO_MOD_LINEAR_Z modifiers to the IIO core, which is preparatory for adding the Bosch BNO055 IMU driver. The Bosch BNO055 IMU can report raw accelerations (along the x, y and z axes) as well as the so-called "linear accelerations" (again, along the x, y and z axes), which is basically the acceleration after subtracting gravity, and which these new modifiers are for. 
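As an illustration (not part of this patch), a driver such as the upcoming BNO055 one could declare a linear-acceleration channel with the new modifier roughly as follows; the channel would then surface in sysfs under a name along the lines of in_accel_linear_x_raw:

    #include <linux/iio/iio.h>

    static const struct iio_chan_spec bno055_lin_accel_x_chan = {
            .type = IIO_ACCEL,
            .modified = 1,
            .channel2 = IIO_MOD_LINEAR_X, /* gravity-free acceleration, x axis */
            .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
    };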
Signed-off-by: Andrea Merello Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220907132205.28021-2-andrea.merello@iit.it Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 3 +++ include/uapi/linux/iio/types.h | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 583e0e5205a0..6f7b6fd0c6ef 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -134,6 +134,9 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_ETHANOL] = "ethanol", [IIO_MOD_H2] = "h2", [IIO_MOD_O2] = "o2", + [IIO_MOD_LINEAR_X] = "linear_x", + [IIO_MOD_LINEAR_Y] = "linear_y", + [IIO_MOD_LINEAR_Z] = "linear_z", }; /* relies on pairs of these shared then separate */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 913864221ac4..b7ba9861a24d 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -95,6 +95,9 @@ enum iio_modifier { IIO_MOD_ETHANOL, IIO_MOD_H2, IIO_MOD_O2, + IIO_MOD_LINEAR_X, + IIO_MOD_LINEAR_Y, + IIO_MOD_LINEAR_Z, }; enum iio_event_type { @@ -118,4 +121,3 @@ enum iio_event_direction { }; #endif /* _UAPI_IIO_TYPES_H_ */ - -- cgit v1.2.3 From dcedf14553810cd6bbf7227c995beb4548e0859d Mon Sep 17 00:00:00 2001 From: Andrea Merello Date: Wed, 7 Sep 2022 15:21:55 +0200 Subject: iio: add modifiers for pitch, yaw, roll Add modifiers for reporting rotations as Euler angles (i.e. yaw, pitch and roll). Signed-off-by: Andrea Merello Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220907132205.28021-5-andrea.merello@iit.it Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 3 +++ include/uapi/linux/iio/types.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 6f7b6fd0c6ef..8f12993f87be 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -137,6 +137,9 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_LINEAR_X] = "linear_x", [IIO_MOD_LINEAR_Y] = "linear_y", [IIO_MOD_LINEAR_Z] = "linear_z", + [IIO_MOD_PITCH] = "pitch", + [IIO_MOD_YAW] = "yaw", + [IIO_MOD_ROLL] = "roll", }; /* relies on pairs of these shared then separate */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index b7ba9861a24d..c79f2f046a0b 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -98,6 +98,9 @@ enum iio_modifier { IIO_MOD_LINEAR_X, IIO_MOD_LINEAR_Y, IIO_MOD_LINEAR_Z, + IIO_MOD_PITCH, + IIO_MOD_YAW, + IIO_MOD_ROLL, }; enum iio_event_type { -- cgit v1.2.3 From 0e426a3ae030a9e891899370229e117158b35de6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:02 +0000 Subject: bpf, cgroup: Reject prog_attach_flags array when effective query Attach flags are only valid for attached progs of this layer cgroup, but not for effective progs. For querying with EFFECTIVE flags, exporting attach flags does not make sense. So for an effective query, we reject the prog_attach_flags array and don't need to populate it. We also force attach_flags to be output as 0 during an effective query. 
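For illustration (not part of this patch), a well-formed effective query from userspace now looks roughly like this; note that prog_attach_flags stays unset:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int query_effective(int cgroup_fd, __u32 *ids, __u32 *cnt)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.query.target_fd = cgroup_fd;
            attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
            attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;
            attr.query.prog_ids = (__u64)(unsigned long)ids;
            attr.query.prog_cnt = *cnt;
            /* leaving attr.query.prog_attach_flags at 0 is now mandatory;
             * setting it together with BPF_F_QUERY_EFFECTIVE yields -EINVAL
             */
            if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) < 0)
                    return -1;
            *cnt = attr.query.prog_cnt;
            return 0;
    }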
Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-2-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 7 +++++-- kernel/bpf/cgroup.c | 28 ++++++++++++++++++---------- tools/include/uapi/linux/bpf.h | 7 +++++-- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4a400cd63731..22888aaa68b6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; enum cgroup_bpf_attach_type from_atype, to_atype; @@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int total_cnt = 0; u32 flags; + if (effective_query && prog_attach_flags) + return -EINVAL; + if (type == BPF_LSM_CGROUP) { - if (attr->query.prog_cnt && prog_ids && !prog_attach_flags) + if (!effective_query && attr->query.prog_cnt && + prog_ids && !prog_attach_flags) return -EINVAL; from_atype = CGROUP_LSM_START; @@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); @@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } } + /* always output uattr->query.attach_flags as 0 during effective query */ + flags = effective_query ? 
0 : flags; if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) @@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + if (effective_query) { effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); @@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, if (++i == cnt) break; } - } - if (prog_attach_flags) { - flags = cgrp->bpf.flags[atype]; + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; - for (i = 0; i < cnt; i++) - if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags))) - return -EFAULT; - prog_attach_flags += cnt; + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, + &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } } prog_ids += cnt; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ -- cgit v1.2.3 From 493108d95f1464ccd101d4e5cfa7e93f1fc64d47 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 21 Sep 2022 12:17:54 +0100 Subject: io_uring/net: zerocopy sendmsg Add a zerocopy version of sendmsg. 
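As a rough user-space sketch of how the new opcode could be issued (raw SQE setup against a liburing-managed ring; the function name is made up, and the socket, ring, and iovec setup are assumed):

#include <string.h>
#include <sys/socket.h>
#include <liburing.h>

/* Hypothetical sketch: queue a zerocopy sendmsg on an initialized ring */
static void queue_sendmsg_zc(struct io_uring *ring, int sockfd,
			     struct iovec *iov)
{
	static struct msghdr msg; /* must stay live until completion */
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	msg.msg_iov = iov;
	msg.msg_iovlen = 1;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SENDMSG_ZC;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long)&msg;
	sqe->msg_flags = MSG_NOSIGNAL;
	io_uring_submit(ring);
	/* the first CQE carries IORING_CQE_F_MORE; a second completion
	 * follows once the kernel no longer needs the buffer */
}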
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6aabc4bdfc0ec78df6ec9328137e394af9d4e7ef.1663668091.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/net.c | 91 ++++++++++++++++++++++++++++++++++++++++--- io_uring/net.h | 1 + io_uring/opdef.c | 19 +++++++++ 4 files changed, 107 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 972b179bc07a..92f29d9505a6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -213,6 +213,7 @@ enum io_uring_op { IORING_OP_SOCKET, IORING_OP_URING_CMD, IORING_OP_SEND_ZC, + IORING_OP_SENDMSG_ZC, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index 209bc69b3707..757a300578f4 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -909,7 +909,12 @@ out_free: void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *io; + if (req_has_async_data(req)) { + io = req->async_data; + kfree(io->free_iov); + } zc->notif->flags |= REQ_F_CQE_SKIP; io_notif_flush(zc->notif); zc->notif = NULL; @@ -921,8 +926,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *notif; - if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) || - READ_ONCE(sqe->__pad3[0])) + if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ if (req->flags & REQ_F_CQE_SKIP) @@ -949,14 +953,24 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io_req_set_rsrc_node(notif, ctx, 0); } + if (req->opcode == IORING_OP_SEND_ZC) { + if (READ_ONCE(sqe->__pad3[0])) + return -EINVAL; + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + zc->addr_len = READ_ONCE(sqe->addr_len); + } else { + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; + if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) + return -EINVAL; + } + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - zc->addr_len = READ_ONCE(sqe->addr_len); zc->done_io = 0; #ifdef CONFIG_COMPAT @@ -1118,6 +1132,73 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned flags, cflags; + int ret, min_ret = 0; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg, issue_flags); + + flags = sr->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + + if (unlikely(ret 
< min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg, issue_flags); + + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg, issue_flags); + } + if (ret < 0 && !sr->done_io) + sr->notif->flags |= REQ_F_CQE_SKIP; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + + io_netmsg_recycle(req, issue_flags); + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + + io_notif_flush(sr->notif); + req->flags &= ~REQ_F_NEED_CLEANUP; + cflags = ret >= 0 ? IORING_CQE_F_MORE : 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + void io_sendrecv_fail(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -1127,7 +1208,7 @@ void io_sendrecv_fail(struct io_kiocb *req) if (req->flags & REQ_F_PARTIAL_IO) res = sr->done_io; if ((req->flags & REQ_F_NEED_CLEANUP) && - req->opcode == IORING_OP_SEND_ZC) { + (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) { /* preserve notification for partial I/O */ if (res < 0) sr->notif->flags |= REQ_F_CQE_SKIP; diff --git a/io_uring/net.h b/io_uring/net.h index 45558e2b0a83..5ffa11bf5d2e 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -57,6 +57,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 0fdeb1bc21de..2330f6da791e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -503,6 +503,25 @@ const struct io_op_def io_op_defs[] = { .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_SENDMSG_ZC] = { + .name = "SENDMSG_ZC", + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .manual_alloc = 1, +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_send_zc_prep, + .issue = io_sendmsg_zc, + .prep_async = io_sendmsg_prep_async, + .cleanup = io_send_zc_cleanup, + .fail = io_sendrecv_fail, +#else + .prep = io_eopnotsupp_prep, #endif }, }; -- cgit v1.2.3 From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer. This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we add one or more BPF helpers.
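For a sense of the declaration side once helpers and libbpf support exist, a BPF-side map definition might look like this (purely illustrative at this point in the series; the size is an arbitrary power-of-two multiple of the page size):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical sketch: declaring a user ring buffer in a BPF object */
struct {
	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
	__uint(max_entries, 256 * 1024); /* data size in bytes */
} user_rb SEC(".maps");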
A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/ringbuf.c | 62 ++++++++++++++++++++++--- kernel/bpf/verifier.c | 3 ++ tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 8 files changed, 65 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b483aea35f41..754e915748fb 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,27 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. 
*/ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, + .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, @@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. 
This page offset is stored as part of record metadata and allows to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c6fbcd0afaf..83710b60e708 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + goto error; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 7c188a598444..7f3b67a8b48f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 38b6bc9c26c3..9a6ca9f31133 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter }\n" + " task_storage | bloom_filter | user_ringbuf }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2ca30ccc774c..d480da05b6de 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { -- cgit v1.2.3 From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map-type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. 
This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_user_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL. Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- include/linux/bpf.h | 11 ++- include/uapi/linux/bpf.h | 38 +++++++++ kernel/bpf/helpers.c | 2 + kernel/bpf/ringbuf.c | 181 +++++++++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 61 +++++++++++++- tools/include/uapi/linux/bpf.h | 38 +++++++++ 6 files changed, 320 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain().
+ */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41aeaf3862ec..cb5564c77482 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1659,6 +1659,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 754e915748fb..9e832acf4692 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,6 +38,22 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. 
* This prevents a user-space application from modifying the @@ -153,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -288,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -301,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, - .map_poll = ringbuf_map_poll, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -321,6 +356,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -362,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -509,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -603,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. 
*/ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. */ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. + */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. */ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. 
+ */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83710b60e708..c76fa45a5906 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -563,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -5688,6 +5689,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5714,7 +5721,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, - [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -6066,6 +6073,13 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. 
+ */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + break; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6241,7 +6255,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_USER_RINGBUF: - goto error; + if (func_id != BPF_FUNC_user_ringbuf_drain) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -6361,6 +6377,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -6887,6 +6907,29 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -7346,12 +7389,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn case BPF_FUNC_dynptr_data: for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - /* Find the id of the dynptr we're tracking the reference of */ - meta.ref_obj_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + + if (base_type(reg->type) != PTR_TO_DYNPTR) + /* Find the id of the dynptr we're + * tracking the reference of + */ + meta.ref_obj_id = stack_slot_get_id(env, reg); break; } } @@ -7360,6 +7409,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. 
* + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 0c3e10cb44232833a50cb8e3e784c432906a60c1 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:31 -0400 Subject: net: phy: Add support for rate matching This adds support for rate matching (also known as rate adaptation) to the phy subsystem. The general idea is that the phy interface runs at one speed, and the MAC throttles the rate at which it sends packets to the link speed. There's a good overview of several techniques for achieving this at [1]. This patch adds support for three: pause-frame based (such as in Aquantia phys), CRS-based (such as in 10PASS-TS and 2BASE-TL), and open-loop-based (such as in 10GBASE-W). This patch makes a few assumptions and a few non assumptions about the types of rate matching available. First, it assumes that different phys may use different forms of rate matching. Second, it assumes that phys can use rate matching for any of their supported link speeds (e.g. if a phy supports 10BASE-T and XGMII, then it can adapt XGMII to 10BASE-T). Third, it does not assume that all interface modes will use the same form of rate matching. Fourth, it does not assume that all phy devices will support rate matching (even if some do). Relaxing or strengthening these (non-)assumptions could result in a different API. 
For example, if all interface modes were assumed to use the same form of rate matching, then a bitmask of interface modes supporting rate matching would suffice. For better visibility into the process, the current rate matching mode is exposed as part of the ethtool ksettings. For the moment, only read access is supported. I'm not sure what userspace might want to configure yet (disable it altogether, disable just one mode, specify the mode to use, etc.). For the moment, since only pause-based rate adaptation support is added in the next few commits, rate matching can be disabled altogether by adjusting the advertisement. 802.3 calls this feature "rate adaptation" in clause 49 (10GBASE-R) and "rate matching" in clause 61 (10PASS-TS and 2BASE-TL). Aquantia also calls this feature "rate adaptation". I chose "rate matching" because it is shorter, and because Russell doesn't think "adaptation" is correct in this context. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 2 ++ drivers/net/phy/phy-core.c | 21 +++++++++++++++++++++ drivers/net/phy/phy.c | 28 ++++++++++++++++++++++++++++ include/linux/phy.h | 22 +++++++++++++++++++++- include/uapi/linux/ethtool.h | 18 ++++++++++++++++-- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/ioctl.c | 1 + net/ethtool/linkmodes.c | 5 +++++ 8 files changed, 95 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index dbca3e9ec782..09fb1d5ba67f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -426,6 +426,7 @@ Kernel response contents: ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE`` u8 Master/slave port state + ``ETHTOOL_A_LINKMODES_RATE_MATCHING`` u8 PHY rate matching ========================================== ====== ========================== For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask @@ -449,6 +450,7 @@ Request contents: ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG`` u8 Master/slave port mode + ``ETHTOOL_A_LINKMODES_RATE_MATCHING`` u8 PHY rate matching ``ETHTOOL_A_LINKMODES_LANES`` u32 lanes ========================================== ====== ========================== diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 2a2924bc8f76..2c8bf438ea61 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -74,6 +74,27 @@ const char *phy_duplex_to_str(unsigned int duplex) } EXPORT_SYMBOL_GPL(phy_duplex_to_str); +/** + * phy_rate_matching_to_str - Return a string describing the rate matching + * + * @rate_matching: Type of rate matching to describe + */ +const char *phy_rate_matching_to_str(int rate_matching) +{ + switch (rate_matching) { + case RATE_MATCH_NONE: + return "none"; + case RATE_MATCH_PAUSE: + return "pause"; + case RATE_MATCH_CRS: + return "crs"; + case RATE_MATCH_OPEN_LOOP: + return "open-loop"; + } + return "Unsupported (update phy-core.c)"; +} +EXPORT_SYMBOL_GPL(phy_rate_matching_to_str); + /** * phy_interface_num_ports - Return the number of links that can be carried by * a given MAC-PHY physical link.
Returns 0 if this is diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 8d3ee3a6495b..e741d8aebffe 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -114,6 +114,33 @@ void phy_print_status(struct phy_device *phydev) } EXPORT_SYMBOL(phy_print_status); +/** + * phy_get_rate_matching - determine if rate matching is supported + * @phydev: The phy device to return rate matching for + * @iface: The interface mode to use + * + * This determines the type of rate matching (if any) that @phy supports + * using @iface. @iface may be %PHY_INTERFACE_MODE_NA to determine if any + * interface supports rate matching. + * + * Return: The type of rate matching @phy supports for @iface, or + * %RATE_MATCH_NONE. + */ +int phy_get_rate_matching(struct phy_device *phydev, + phy_interface_t iface) +{ + int ret = RATE_MATCH_NONE; + + if (phydev->drv->get_rate_matching) { + mutex_lock(&phydev->lock); + ret = phydev->drv->get_rate_matching(phydev, iface); + mutex_unlock(&phydev->lock); + } + + return ret; +} +EXPORT_SYMBOL_GPL(phy_get_rate_matching); + /** * phy_config_interrupt - configure the PHY device for the requested interrupts * @phydev: the phy_device struct @@ -256,6 +283,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.duplex = phydev->duplex; cmd->base.master_slave_cfg = phydev->master_slave_get; cmd->base.master_slave_state = phydev->master_slave_state; + cmd->base.rate_matching = phydev->rate_matching; if (phydev->interface == PHY_INTERFACE_MODE_MOCA) cmd->base.port = PORT_BNC; else diff --git a/include/linux/phy.h b/include/linux/phy.h index 337230c135f7..9c66f357f489 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -280,7 +280,6 @@ static inline const char *phy_modes(phy_interface_t interface) } } - #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 @@ -574,6 +573,7 @@ struct macsec_ops; * @lp_advertising: Current link partner advertised linkmodes * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used + * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover @@ -641,6 +641,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + int rate_matching; + enum phy_state state; u32 dev_flags; @@ -805,6 +807,21 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @get_rate_matching: Get the supported type of rate matching for a + * particular phy interface. This is used by phy consumers to determine + * whether to advertise lower-speed modes for that interface. It is + * assumed that if a rate matching mode is supported on an interface, + * then that interface's rate can be adapted to all slower link speeds + * supported by the phy. If iface is %PHY_INTERFACE_MODE_NA, and the phy + * supports any kind of rate matching for any interface, then it must + * return that rate matching mode (preferring %RATE_MATCH_PAUSE to + * %RATE_MATCH_CRS). If the interface is not supported, this should + * return %RATE_MATCH_NONE. 
+ */ + int (*get_rate_matching)(struct phy_device *phydev, + phy_interface_t iface); + /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); @@ -971,6 +988,7 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); @@ -1687,6 +1705,8 @@ int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); +int phy_get_rate_matching(struct phy_device *phydev, + phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 2d5741fd44bb..fe9893d1485d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1840,6 +1840,20 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define MASTER_SLAVE_STATE_SLAVE 3 #define MASTER_SLAVE_STATE_ERR 4 +/* These are used to throttle the rate of data on the phy interface when the + * native speed of the interface is higher than the link speed. These should + * not be used for phy interfaces which natively support multiple speeds (e.g. + * MII or SGMII). + */ +/* No rate matching performed. */ +#define RATE_MATCH_NONE 0 +/* The phy sends pause frames to throttle the MAC. */ +#define RATE_MATCH_PAUSE 1 +/* The phy asserts CRS to prevent the MAC from transmitting. */ +#define RATE_MATCH_CRS 2 +/* The MAC is programmed with a sufficiently-large IPG. */ +#define RATE_MATCH_OPEN_LOOP 3 + /* Which connector port. */ #define PORT_TP 0x00 #define PORT_AUI 0x01 @@ -2033,8 +2047,8 @@ enum ethtool_reset_flags { * reported consistently by PHYLIB. Read-only. * @master_slave_cfg: Master/slave port mode. * @master_slave_state: Master/slave port state. + * @rate_matching: Rate adaptation performed by the PHY * @reserved: Reserved for future use; see the note on reserved space. - * @reserved1: Reserved for future use; see the note on reserved space. * @link_mode_masks: Variable length bitmaps. 
* * If autonegotiation is disabled, the speed and @duplex represent the @@ -2085,7 +2099,7 @@ struct ethtool_link_settings { __u8 transceiver; __u8 master_slave_cfg; __u8 master_slave_state; - __u8 reserved1[1]; + __u8 rate_matching; __u32 reserved[7]; __u32 link_mode_masks[]; /* layout of link_mode_masks fields: diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d2fb4f7be61b..408a664fad59 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -242,6 +242,7 @@ enum { ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ ETHTOOL_A_LINKMODES_LANES, /* u32 */ + ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 9298eb3251cb..57e7238a4136 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -571,6 +571,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev, = __ETHTOOL_LINK_MODE_MASK_NU32; link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + link_ksettings.base.rate_matching = RATE_MATCH_NONE; return store_link_ksettings_for_user(useraddr, &link_ksettings); } diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 99b29b4fe947..126e06c713a3 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -70,6 +70,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + + nla_total_size(sizeof(u8)) /* LINKMODES_RATE_MATCHING */ + 0; ret = ethnl_bitset_size(ksettings->link_modes.advertising, ksettings->link_modes.supported, @@ -143,6 +144,10 @@ static int linkmodes_fill_reply(struct sk_buff *skb, lsettings->master_slave_state)) return -EMSGSIZE; + if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_RATE_MATCHING, + lsettings->rate_matching)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From 195624d9c26b64c6856863da30ec578a790feec4 Mon Sep 17 00:00:00 2001 From: Patrick Rohr Date: Tue, 20 Sep 2022 12:48:25 -0700 Subject: tun: support not enabling carrier in TUNSETIFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds support for not enabling carrier during TUNSETIFF interface creation by specifying the IFF_NO_CARRIER flag. Our tests make heavy use of tun interfaces. In some scenarios, the test process creates the interface but another process brings it up after the interface is discovered via netlink notification. In that case, it is not possible to create a tun/tap interface with carrier off without it racing against the bring up. Immediately setting carrier off via TUNSETCARRIER is still too late. Signed-off-by: Patrick Rohr Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Cc: Jason Wang Cc: Stephen Hemminger Cc: Nicolas Dichtel Reviewed-by: Maciej Żenczykowski Acked-by: Jason Wang Reviewed-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 9 ++++++--- include/uapi/linux/if_tun.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 259b2b84b2b3..db736b944016 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2828,7 +2828,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) rcu_assign_pointer(tfile->tun, tun); } - netif_carrier_on(tun->dev); + if (ifr->ifr_flags & IFF_NO_CARRIER) + netif_carrier_off(tun->dev); + else + netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. @@ -3056,8 +3059,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, * This is needed because we never checked for invalid flags on * TUNSETIFF. */ - return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, - (unsigned int __user*)argp); + return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | + TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 2ec07de1d73b..b6d7b868f290 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -67,6 +67,8 @@ #define IFF_TAP 0x0002 #define IFF_NAPI 0x0010 #define IFF_NAPI_FRAGS 0x0020 +/* Used in TUNSETIFF to bring up tun/tap without carrier */ +#define IFF_NO_CARRIER 0x0040 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 -- cgit v1.2.3 From 77a440e2cbb4b8688b567104f80ce1cda1afbbc4 Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Fri, 23 Sep 2022 23:39:14 +0800 Subject: ublk_drv: define macros for recovery feature and check them Define some macros for the recovery feature. UBLK_S_DEV_QUIESCED implies that ublk_device is quiesced and is ready for recovery. This state can be observed by userspace. UBLK_F_USER_RECOVERY implies that: (1) ublk_drv enables the recovery feature. It won't let monitor_work automatically abort rqs and release the device. (2) With a dying ubq_daemon, ublk_drv ends (aborts) rqs issued to userspace (ublksrv) before the crash. (3) With a dying ubq_daemon, in task work and ublk_queue_rq(), ublk_drv requeues rqs.
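For context, a ublksrv backend would opt in roughly as follows when adding a device (a sketch only: the control-device io_uring plumbing and the remaining dev_info fields are assumed):

#include <linux/ublk_cmd.h>

/* Hypothetical sketch: dev_info a backend would pass to the ADD_DEV
 * control command to request recovery support (other fields elided) */
static struct ublksrv_ctrl_dev_info info = {
	.nr_hw_queues = 1,
	.queue_depth = 128,
	.flags = UBLK_F_USER_RECOVERY, /* requeue rqs instead of releasing the device */
};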
Signed-off-by: ZiyangZhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220923153919.44078-3-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 18 +++++++++++++++++- include/uapi/linux/ublk_cmd.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c39b67d7133d..05bfbaa49696 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -49,7 +49,8 @@ /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ - | UBLK_F_NEED_GET_DATA) + | UBLK_F_NEED_GET_DATA \ + | UBLK_F_USER_RECOVERY) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) @@ -323,6 +324,21 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) PAGE_SIZE); } +static inline bool ublk_queue_can_use_recovery( + struct ublk_queue *ubq) +{ + if (ubq->flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + +static inline bool ublk_can_use_recovery(struct ublk_device *ub) +{ + if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + static void ublk_free_disk(struct gendisk *disk) { struct ublk_device *ub = disk->private_data; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 677edaab2b66..340ff14bde49 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -74,9 +74,12 @@ */ #define UBLK_F_NEED_GET_DATA (1UL << 2) +#define UBLK_F_USER_RECOVERY (1UL << 3) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 +#define UBLK_S_DEV_QUIESCED 2 /* shipped via sqe->cmd of io_uring command */ struct ublksrv_ctrl_cmd { -- cgit v1.2.3 From a0d41dc1137470fd4c5c2ef8fdc244d7565e69e6 Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Fri, 23 Sep 2022 23:39:17 +0800 Subject: ublk_drv: support UBLK_F_USER_RECOVERY_REISSUE UBLK_F_USER_RECOVERY_REISSUE implies that, with a dying ubq_daemon, ublk_drv lets monitor_work requeue rqs that were issued to userspace (ublksrv) before the ubq_daemon died. UBLK_F_USER_RECOVERY_REISSUE is designed for backends which: (1) tolerate double-writes, since ublk_drv may issue the same rq twice, and (2) must not let frontend users see I/O errors, such as a read-only FS or a VM backend.
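Extending the ADD_DEV sketch above, a backend that can tolerate replays would set both flags together (illustrative):

/* Hypothetical sketch: opt in to requeue-and-reissue semantics */
.flags = UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE,
/* With REISSUE set, an rq caught in flight when the ubq_daemon dies
 * is requeued and issued again after recovery (hence the tolerated
 * double-write) instead of completing with BLK_STS_IOERR. */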
Signed-off-by: ZiyangZhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220923153919.44078-6-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 22 ++++++++++++++++++---- include/uapi/linux/ublk_cmd.h | 2 ++ 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3cdcc9bd635f..76ee41e82973 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -50,7 +50,8 @@ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ | UBLK_F_NEED_GET_DATA \ - | UBLK_F_USER_RECOVERY) + | UBLK_F_USER_RECOVERY \ + | UBLK_F_USER_RECOVERY_REISSUE) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) @@ -325,6 +326,15 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) PAGE_SIZE); } +static inline bool ublk_queue_can_use_recovery_reissue( + struct ublk_queue *ubq) +{ + if ((ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) + return true; + return false; +} + static inline bool ublk_queue_can_use_recovery( struct ublk_queue *ubq) { @@ -629,13 +639,17 @@ static void ublk_complete_rq(struct request *req) * Also aborting may not be started yet, keep in mind that one failed * request may be issued by block layer again. */ -static void __ublk_fail_req(struct ublk_io *io, struct request *req) +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { io->flags |= UBLK_IO_FLAG_ABORTED; - blk_mq_end_request(req, BLK_STS_IOERR); + if (ublk_queue_can_use_recovery_reissue(ubq)) + blk_mq_requeue_request(req, false); + else + blk_mq_end_request(req, BLK_STS_IOERR); } } @@ -962,7 +976,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); if (rq) - __ublk_fail_req(io, rq); + __ublk_fail_req(ubq, io, rq); } } ublk_put_device(ub); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 340ff14bde49..332370628757 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -76,6 +76,8 @@ #define UBLK_F_USER_RECOVERY (1UL << 3) +#define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From c732a852b419fa057b53657e2daaf9433940391c Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Fri, 23 Sep 2022 23:39:18 +0800 Subject: ublk_drv: add START_USER_RECOVERY and END_USER_RECOVERY support START_USER_RECOVERY and END_USER_RECOVERY are two new control commands to support the user recovery feature. After a crash, the user should send START_USER_RECOVERY, which will: (1) check that (a) the current ublk_device is UBLK_S_DEV_QUIESCED, which was set by quiesce_work, and (b) the chardev is released (2) reinit all ubqs, including: (a) put the task_struct and reset ->ubq_daemon to NULL. (b) reset all ublk_io. (3) reset ub->mm to NULL. Then, the user should start a new process and send FETCH_REQ on each ubq_daemon. Finally, the user should send END_USER_RECOVERY, which will: (1) wait for all new ubq_daemons to get ready. (2) update ublksrv_pid (3) unquiesce the request queue and expect incoming ublk_queue_rq() (4) convert ub's state to UBLK_S_DEV_LIVE Note: we can handle STOP_DEV between START_USER_RECOVERY and END_USER_RECOVERY. This is helpful to users who cannot start a new process after sending the START_USER_RECOVERY ctrl-cmd.
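Putting the two commands together, a crashed ublk server could be restarted along the lines of the sketch below. This is a rough outline under stated assumptions, not a reference implementation: ctrl_cmd() and start_new_ublksrv() are invented helpers (the former wrapping a uring_cmd on /dev/ublk-control), and most error handling is omitted.

#include <sys/types.h>
#include <linux/ublk_cmd.h>

/* Assumed helpers, not part of the kernel UAPI. */
int ctrl_cmd(int ctrl_fd, unsigned int cmd_op, int dev_id, int data);
pid_t start_new_ublksrv(int dev_id, int nr_queues);	/* re-sends FETCH_REQ per queue */

static int recover_dev(int ctrl_fd, int dev_id, int nr_queues)
{
	pid_t new_pid;

	/* Step 1: only succeeds once the device is UBLK_S_DEV_QUIESCED
	 * and the old process has closed /dev/ublkcN. */
	if (ctrl_cmd(ctrl_fd, UBLK_CMD_START_USER_RECOVERY, dev_id, 0))
		return -1;

	/* Step 2: a fresh daemon mmaps the cmd buffers and issues
	 * FETCH_REQ on every queue, exactly as at first start-up. */
	new_pid = start_new_ublksrv(dev_id, nr_queues);
	if (new_pid < 0)
		return -1;

	/* Step 3: hand over the new pid; the driver unquiesces the
	 * request queue and flips the state back to UBLK_S_DEV_LIVE. */
	return ctrl_cmd(ctrl_fd, UBLK_CMD_END_USER_RECOVERY, dev_id, new_pid);
}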
Signed-off-by: ZiyangZhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220923153919.44078-7-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 116 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 3 +- 2 files changed, 118 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 76ee41e82973..2651bf41dde3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1861,6 +1861,116 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) return ret; } +static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ + WARN_ON_ONCE(ubq->nr_io_ready); + /* old daemon is PF_EXITING, put it now */ + put_task_struct(ubq->ubq_daemon); + /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ + ubq->ubq_daemon = NULL; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + /* forget everything now and be ready for new FETCH_REQ */ + io->flags = 0; + io->cmd = NULL; + io->addr = 0; + } +} + +static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub; + int ret = -EINVAL; + int i; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + /* + * START_RECOVERY is only allowed after: + * + * (1) UB_STATE_OPEN is not set, which means the dying process is exited + * and related io_uring ctx is freed so file struct of /dev/ublkcX is + * released.
+ * + * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: + * (a)has quiesced request queue + * (b)has requeued every inflight rqs whose io_flags is ACTIVE + * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE + * (d)has completed/camceled all ioucmds owned by ther dying process + */ + if (test_bit(UB_STATE_OPEN, &ub->state) || + ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + ub->mm = NULL; + ub->nr_queues_ready = 0; + init_completion(&ub->completion); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ublksrv_pid = (int)header->data[0]; + struct ublk_device *ub; + int ret = -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + /* wait until new ubq_daemon sending all FETCH_REQ */ + wait_for_completion_interruptible(&ub->completion); + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + + if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + ub->dev_info.ublksrv_pid = ublksrv_pid; + pr_devel("%s: new ublksrv_pid %d, dev id %d\n", + __func__, ublksrv_pid, header->dev_id); + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1902,6 +2012,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_SET_PARAMS: ret = ublk_ctrl_set_params(cmd); break; + case UBLK_CMD_START_USER_RECOVERY: + ret = ublk_ctrl_start_recovery(cmd); + break; + case UBLK_CMD_END_USER_RECOVERY: + ret = ublk_ctrl_end_recovery(cmd); + break; default: break; } diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 332370628757..8f88e3a29998 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -17,7 +17,8 @@ #define UBLK_CMD_STOP_DEV 0x07 #define UBLK_CMD_SET_PARAMS 0x08 #define UBLK_CMD_GET_PARAMS 0x09 - +#define UBLK_CMD_START_USER_RECOVERY 0x10 +#define UBLK_CMD_END_USER_RECOVERY 0x11 /* * IO commands, issued by ublk server, and handled by ublk driver. * -- cgit v1.2.3 From 7d37539037c2fca70346fbedc219f655253d5cff Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 24 Sep 2022 07:00:00 +0200 Subject: fuse: implement ->tmpfile() This is basically equivalent to the FUSE_CREATE operation which creates and opens a regular file. Add a new FUSE_TMPFILE operation, otherwise just reuse the protocol and the code for FUSE_CREATE. 
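For context on what this buys userspace: once a filesystem answers FUSE_TMPFILE, ordinary O_TMPFILE callers start working on it with no FUSE-specific code. The snippet below is standard Linux userspace, independent of this patch; on a server without ->tmpfile() support the open() simply fails (EOPNOTSUPP) and callers fall back to a named temporary file as before.

#define _GNU_SOURCE	/* for O_TMPFILE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write an unnamed file, then publish it atomically via linkat(2). */
static int write_then_publish(const char *dir, const char *name)
{
	char proc_path[64];
	int fd = open(dir, O_TMPFILE | O_WRONLY, 0644);

	if (fd < 0)		/* e.g. EOPNOTSUPP before this patch */
		return -1;
	if (write(fd, "data\n", 5) != 5) {
		close(fd);
		return -1;
	}
	/* Give the unnamed inode a name; until here a crash leaves no file. */
	snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc_path, AT_FDCWD, name, AT_SYMLINK_FOLLOW)) {
		close(fd);
		return -1;
	}
	return close(fd);
}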
Acked-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 24 +++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 6 +++++- 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b585b04e815e..bb97a384dc5d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -529,7 +529,7 @@ out_err: */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, - umode_t mode) + umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = FUSE_CREATE; + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -1913,6 +1930,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 488b460e046f..98a9cf531873 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -784,6 +784,9 @@ struct fuse_conn { /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; + /* Is tmpfile not implemented by fs? 
*/ + unsigned int no_tmpfile:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d6ccee961891..76ee8f9e024a 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -194,6 +194,9 @@ * - add FUSE_SECURITY_CTX init flag * - add security context to create, mkdir, symlink, and mknod requests * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX + * + * 7.37 + * - add FUSE_TMPFILE */ #ifndef _LINUX_FUSE_H @@ -229,7 +232,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 36 +#define FUSE_KERNEL_MINOR_VERSION 37 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -537,6 +540,7 @@ enum fuse_opcode { FUSE_SETUPMAPPING = 48, FUSE_REMOVEMAPPING = 49, FUSE_SYNCFS = 50, + FUSE_TMPFILE = 51, /* CUSE specific operations */ CUSE_INIT = 4096, -- cgit v1.2.3 From 8e2b7442d27ca2a56a116d44d597e77ca21dfed3 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Thu, 9 Jun 2022 12:31:13 +0200 Subject: media: rockchip: rkisp1: Define macros for DPCC configurations in UAPI Extend the UAPI rkisp1-config.h header with macros for all DPCC configuration fields. While at it, clarify or fix issues in the DPCC documentation. Signed-off-by: Laurent Pinchart Reviewed-by: Paul Elder Reviewed-by: Dafna Hirschfeld Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/rockchip/rkisp1/rkisp1-regs.h | 1 - include/uapi/linux/rkisp1-config.h | 77 +++++++++++++++++----- 2 files changed, 61 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h index dc01f968c19d..a931f7216e9b 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h @@ -620,7 +620,6 @@ /* DPCC */ #define RKISP1_CIF_ISP_DPCC_MODE_DPCC_ENABLE BIT(0) #define RKISP1_CIF_ISP_DPCC_MODE_GRAYSCALE_MODE BIT(1) -#define RKISP1_CIF_ISP_DPCC_MODE_STAGE1_ENABLE BIT(2) #define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_MASK GENMASK(3, 0) #define RKISP1_CIF_ISP_DPCC_SET_USE_MASK GENMASK(3, 0) #define RKISP1_CIF_ISP_DPCC_METHODS_SET_MASK 0x00001f1f diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h index 583ca0d9a79d..730673ecc63d 100644 --- a/include/uapi/linux/rkisp1-config.h +++ b/include/uapi/linux/rkisp1-config.h @@ -117,7 +117,46 @@ /* * Defect Pixel Cluster Correction */ -#define RKISP1_CIF_ISP_DPCC_METHODS_MAX 3 +#define RKISP1_CIF_ISP_DPCC_METHODS_MAX 3 + +#define RKISP1_CIF_ISP_DPCC_MODE_STAGE1_ENABLE (1U << 2) + +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_INCL_G_CENTER (1U << 0) +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_INCL_RB_CENTER (1U << 1) +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_G_3X3 (1U << 2) +#define RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_STAGE1_RB_3X3 (1U << 3) + +/* 0-2 for sets 1-3 */ +#define RKISP1_CIF_ISP_DPCC_SET_USE_STAGE1_USE_SET(n) ((n) << 0) +#define RKISP1_CIF_ISP_DPCC_SET_USE_STAGE1_USE_FIX_SET (1U << 3) + +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_PG_GREEN_ENABLE (1U << 0) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_LC_GREEN_ENABLE (1U << 1) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RO_GREEN_ENABLE (1U << 2) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RND_GREEN_ENABLE (1U << 3) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RG_GREEN_ENABLE (1U << 4) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_PG_RED_BLUE_ENABLE (1U << 8) +#define
RKISP1_CIF_ISP_DPCC_METHODS_SET_LC_RED_BLUE_ENABLE (1U << 9) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RO_RED_BLUE_ENABLE (1U << 10) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RND_RED_BLUE_ENABLE (1U << 11) +#define RKISP1_CIF_ISP_DPCC_METHODS_SET_RG_RED_BLUE_ENABLE (1U << 12) + +#define RKISP1_CIF_ISP_DPCC_LINE_THRESH_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_LINE_THRESH_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_PG_FAC_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_PG_FAC_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_RND_THRESH_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_RND_THRESH_RB(v) ((v) << 8) +#define RKISP1_CIF_ISP_DPCC_RG_FAC_G(v) ((v) << 0) +#define RKISP1_CIF_ISP_DPCC_RG_FAC_RB(v) ((v) << 8) + +#define RKISP1_CIF_ISP_DPCC_RO_LIMITS_n_G(n, v) ((v) << ((n) * 4)) +#define RKISP1_CIF_ISP_DPCC_RO_LIMITS_n_RB(n, v) ((v) << ((n) * 4 + 2)) + +#define RKISP1_CIF_ISP_DPCC_RND_OFFS_n_G(n, v) ((v) << ((n) * 4)) +#define RKISP1_CIF_ISP_DPCC_RND_OFFS_n_RB(n, v) ((v) << ((n) * 4 + 2)) /* * Denoising pre filter @@ -249,16 +288,20 @@ struct rkisp1_cif_isp_bls_config { }; /** - * struct rkisp1_cif_isp_dpcc_methods_config - Methods Configuration used by DPCC + * struct rkisp1_cif_isp_dpcc_methods_config - DPCC methods set configuration * - * Methods Configuration used by Defect Pixel Cluster Correction + * This structure stores the configuration of one set of methods for the DPCC + * algorithm. Multiple methods can be selected in each set (independently for + * the Green and Red/Blue components) through the @method field, the result is + * the logical AND of all enabled methods. The remaining fields set thresholds + * and factors for each method. * - * @method: Method enable bits - * @line_thresh: Line threshold - * @line_mad_fac: Line MAD factor - * @pg_fac: Peak gradient factor - * @rnd_thresh: Rank Neighbor Difference threshold - * @rg_fac: Rank gradient factor + * @method: Method enable bits (RKISP1_CIF_ISP_DPCC_METHODS_SET_*) + * @line_thresh: Line threshold (RKISP1_CIF_ISP_DPCC_LINE_THRESH_*) + * @line_mad_fac: Line Mean Absolute Difference factor (RKISP1_CIF_ISP_DPCC_LINE_MAD_FAC_*) + * @pg_fac: Peak gradient factor (RKISP1_CIF_ISP_DPCC_PG_FAC_*) + * @rnd_thresh: Rank Neighbor Difference threshold (RKISP1_CIF_ISP_DPCC_RND_THRESH_*) + * @rg_fac: Rank gradient factor (RKISP1_CIF_ISP_DPCC_RG_FAC_*) */ struct rkisp1_cif_isp_dpcc_methods_config { __u32 method; @@ -272,14 +315,16 @@ struct rkisp1_cif_isp_dpcc_methods_config { /** * struct rkisp1_cif_isp_dpcc_config - Configuration used by DPCC * - * Configuration used by Defect Pixel Cluster Correction + * Configuration used by Defect Pixel Cluster Correction. Three sets of methods + * can be configured and selected through the @set_use field. The result is the + * logical OR of all enabled sets. 
* - * @mode: dpcc output mode - * @output_mode: whether use hard coded methods - * @set_use: stage1 methods set - * @methods: methods config - * @ro_limits: rank order limits - * @rnd_offs: differential rank offsets for rank neighbor difference + * @mode: DPCC mode (RKISP1_CIF_ISP_DPCC_MODE_*) + * @output_mode: Interpolation output mode (RKISP1_CIF_ISP_DPCC_OUTPUT_MODE_*) + * @set_use: Methods sets selection (RKISP1_CIF_ISP_DPCC_SET_USE_*) + * @methods: Methods sets configuration + * @ro_limits: Rank order limits (RKISP1_CIF_ISP_DPCC_RO_LIMITS_*) + * @rnd_offs: Differential rank offsets for rank neighbor difference (RKISP1_CIF_ISP_DPCC_RND_OFFS_*) */ struct rkisp1_cif_isp_dpcc_config { __u32 mode; -- cgit v1.2.3 From 479747caa5bfa94b856bf47249006e6c8aa8be37 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 30 Aug 2022 12:37:24 +0200 Subject: media: cec: add support for Absolute Volume Control Add support for this new CEC message. This was added in HDMI 2.1a. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/cec.h.rst.exceptions | 2 ++ drivers/media/cec/core/cec-adap.c | 1 + include/uapi/linux/cec-funcs.h | 14 ++++++++++++++ include/uapi/linux/cec.h | 2 ++ 4 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/cec.h.rst.exceptions b/Documentation/userspace-api/media/cec.h.rst.exceptions index 13de01d9555e..15fa1752d4ef 100644 --- a/Documentation/userspace-api/media/cec.h.rst.exceptions +++ b/Documentation/userspace-api/media/cec.h.rst.exceptions @@ -239,6 +239,7 @@ ignore define CEC_OP_FEAT_DEV_HAS_DECK_CONTROL ignore define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_RATE ignore define CEC_OP_FEAT_DEV_SINK_HAS_ARC_TX ignore define CEC_OP_FEAT_DEV_SOURCE_HAS_ARC_RX +ignore define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_VOLUME_LEVEL ignore define CEC_MSG_GIVE_FEATURES @@ -487,6 +488,7 @@ ignore define CEC_OP_SYS_AUD_STATUS_ON ignore define CEC_MSG_SYSTEM_AUDIO_MODE_REQUEST ignore define CEC_MSG_SYSTEM_AUDIO_MODE_STATUS +ignore define CEC_MSG_SET_AUDIO_VOLUME_LEVEL ignore define CEC_OP_AUD_FMT_ID_CEA861 ignore define CEC_OP_AUD_FMT_ID_CEA861_CXT diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c index 41a79293ee02..4f5ab3cae8a7 100644 --- a/drivers/media/cec/core/cec-adap.c +++ b/drivers/media/cec/core/cec-adap.c @@ -1027,6 +1027,7 @@ static const u8 cec_msg_size[256] = { [CEC_MSG_REPORT_SHORT_AUDIO_DESCRIPTOR] = 2 | DIRECTED, [CEC_MSG_REQUEST_SHORT_AUDIO_DESCRIPTOR] = 2 | DIRECTED, [CEC_MSG_SET_SYSTEM_AUDIO_MODE] = 3 | BOTH, + [CEC_MSG_SET_AUDIO_VOLUME_LEVEL] = 3 | DIRECTED, [CEC_MSG_SYSTEM_AUDIO_MODE_REQUEST] = 2 | DIRECTED, [CEC_MSG_SYSTEM_AUDIO_MODE_STATUS] = 3 | DIRECTED, [CEC_MSG_SET_AUDIO_RATE] = 3 | DIRECTED, diff --git a/include/uapi/linux/cec-funcs.h b/include/uapi/linux/cec-funcs.h index c3baaea0b8ef..d58fa1cdcb08 100644 --- a/include/uapi/linux/cec-funcs.h +++ b/include/uapi/linux/cec-funcs.h @@ -1568,6 +1568,20 @@ static inline void cec_ops_request_short_audio_descriptor(const struct cec_msg * } } +static inline void cec_msg_set_audio_volume_level(struct cec_msg *msg, + __u8 audio_volume_level) +{ + msg->len = 3; + msg->msg[1] = CEC_MSG_SET_AUDIO_VOLUME_LEVEL; + msg->msg[2] = audio_volume_level; +} + +static inline void cec_ops_set_audio_volume_level(const struct cec_msg *msg, + __u8 *audio_volume_level) +{ + *audio_volume_level = msg->msg[2]; +} + /* Audio Rate Control Feature */ static inline void cec_msg_set_audio_rate(struct cec_msg *msg, diff --git 
a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index 1d48da926216..b8e071abaea5 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -768,6 +768,7 @@ struct cec_event { #define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_RATE 0x08 #define CEC_OP_FEAT_DEV_SINK_HAS_ARC_TX 0x04 #define CEC_OP_FEAT_DEV_SOURCE_HAS_ARC_RX 0x02 +#define CEC_OP_FEAT_DEV_HAS_SET_AUDIO_VOLUME_LEVEL 0x01 #define CEC_MSG_GIVE_FEATURES 0xa5 /* HDMI 2.0 */ @@ -1059,6 +1060,7 @@ struct cec_event { #define CEC_OP_AUD_FMT_ID_CEA861 0 #define CEC_OP_AUD_FMT_ID_CEA861_CXT 1 +#define CEC_MSG_SET_AUDIO_VOLUME_LEVEL 0x73 /* Audio Rate Control Feature */ #define CEC_MSG_SET_AUDIO_RATE 0x9a -- cgit v1.2.3 From 1c56ab991903dce60e905a08f431c0e6f79b9b9e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 9 Aug 2022 13:02:18 +0800 Subject: btrfs: separate BLOCK_GROUP_TREE compat RO flag from EXTENT_TREE_V2 The problem of long mount time caused by block group item search is already known for some time, and the solution of block group tree has been proposed. There is really no need to bound this feature into extent tree v2, just introduce compat RO flag, BLOCK_GROUP_TREE, to correctly solve the problem. All the code handling block group root is already in the upstream kernel, thus this patch really only needs to introduce the new compat RO flag. This patch introduces one extra artificial limitation on block group tree feature, that free space cache v2 and no-holes feature must be enabled to use this new compat RO feature. This artificial requirement is mostly to reduce the test combinations, and can be a guideline for future features, to mostly rely on the latest default features. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/disk-io.c | 26 ++++++++++++++------------ fs/btrfs/disk-io.h | 2 +- fs/btrfs/sysfs.c | 2 ++ include/uapi/linux/btrfs.h | 6 ++++++ 5 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c38daa822ef..7007c7974a2e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -302,7 +302,8 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY) + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dfc9b3aea76a..120b7697ab42 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1981,7 +1981,7 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - if (!btrfs_fs_incompat(info, EXTENT_TREE_V2)) { + if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) { struct btrfs_root *extent_root = btrfs_extent_root(info, 0); struct btrfs_root *csum_root = btrfs_csum_root(info, 0); @@ -2526,7 +2526,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { @@ -2711,6 +2711,18 @@ int btrfs_validate_super(struct btrfs_fs_info 
*fs_info, ret = -EINVAL; } + /* + * Artificial requirement for block-group-tree to force newer features + * (free-space-tree, no-holes) so the test matrix is smaller. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES))) { + btrfs_err(fs_info, + "block-group-tree feature requires fres-space-tree and no-holes"); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -2880,16 +2892,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - struct btrfs_root *root; - - root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, - GFP_KERNEL); - if (!root) - return -ENOMEM; - fs_info->block_group_root = root; - } - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index aef981de672c..7e545ec09a10 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -105,7 +105,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 84a992681801..6816bc41c228 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -286,6 +286,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); @@ -317,6 +318,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), + BTRFS_FEAT_ATTR_PTR(block_group_tree), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 7ada84e4a3ed..5655e89b962b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -290,6 +290,12 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID (1ULL << 1) #define BTRFS_FEATURE_COMPAT_RO_VERITY (1ULL << 2) +/* + * Put all block group items into a dedicated block group tree, greatly + * reducing mount time for large filesystem due to better locality. + */ +#define BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE (1ULL << 3) + #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -- cgit v1.2.3 From e71564c0438a1c0cffc5c8eb302ec5d849103b08 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Aug 2022 09:14:06 +0800 Subject: btrfs: introduce BTRFS_QGROUP_STATUS_FLAGS_MASK for later expansion Currently we only have 3 qgroup flags: - BTRFS_QGROUP_STATUS_FLAG_ON - BTRFS_QGROUP_STATUS_FLAG_RESCAN - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT These flags match the on-disk flags used in btrfs_qgroup_status. 
But we're going to introduce extra runtime flags which will not reach disks. So here we introduce a new mask, BTRFS_QGROUP_STATUS_FLAGS_MASK, to make sure only those flags can reach disks. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 6 ++++-- include/uapi/linux/btrfs_tree.h | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ba323dcb0a0b..fc2ed19ced9b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -878,7 +878,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); - btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); @@ -1052,7 +1053,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(leaf); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 5f32a2a495dc..1f7a38ec6ac3 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -965,6 +965,10 @@ static inline __u16 btrfs_qgroup_level(__u64 qgroupid) */ #define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) +#define BTRFS_QGROUP_STATUS_FLAGS_MASK (BTRFS_QGROUP_STATUS_FLAG_ON | \ + BTRFS_QGROUP_STATUS_FLAG_RESCAN | \ + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) + #define BTRFS_QGROUP_STATUS_VERSION 1 struct btrfs_qgroup_status_item { -- cgit v1.2.3 From 0e253f7e558a3e250902ba2034091e0185448836 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:39 +0200 Subject: bpf: Return value in kprobe get_func_ip only for entry address Changing return value of kprobe's version of bpf_get_func_ip to return zero if the attach address is not on the function's entry point. For kprobes attached in the middle of the function we can't easily get to the function address especially now with the CONFIG_X86_KERNEL_IBT support. If user cares about current IP for kprobes attached within the function body, they can get it with PT_REGS_IP(ctx). Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martynas Pumputis Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 5 ++++- tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/progs/get_func_ip_test.c | 4 ++-- 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ead35f39f185..d6bd10759eaf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4951,6 +4951,7 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. 
+ * 0 for kprobes placed within the function (not at the entry). * * u64 bpf_get_attach_cookie(void *ctx) * Description diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebd1b348beb3..688552df95ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1048,7 +1048,10 @@ BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (uintptr_t)kp->addr : 0; + if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) + return 0; + + return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ead35f39f185..d6bd10759eaf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4951,6 +4951,7 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). * * u64 bpf_get_attach_cookie(void *ctx) * Description diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index a587aeca5ae0..6db70757bc8b 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -69,7 +69,7 @@ int test6(struct pt_regs *ctx) { __u64 addr = bpf_get_func_ip(ctx); - test6_result = (const void *) addr == &bpf_fentry_test6 + 5; + test6_result = (const void *) addr == 0; return 0; } @@ -79,6 +79,6 @@ int test7(struct pt_regs *ctx) { __u64 addr = bpf_get_func_ip(ctx); - test7_result = (const void *) addr == &bpf_fentry_test7 + 5; + test7_result = (const void *) addr == 0; return 0; } -- cgit v1.2.3 From 73dfe93ea1b319482e6d82a54fe06f953ceeeccb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 22 Sep 2022 20:41:40 +0200 Subject: headers: Remove some left-over license text Remove some left-over from commit e2be04c7f995 ("License cleanup: add SPDX license identifier to uapi header files with a license") When the SPDX-License-Identifier tag has been added, the corresponding license text has not been removed. Signed-off-by: Christophe JAILLET Acked-by: Alexander Duyck Reviewed-by: Jiri Pirko Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/88410cddd31197ea26840d7dd71612bece8c6acf.1663871981.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/uapi/linux/tc_act/tc_bpf.h | 5 ----- include/uapi/linux/tc_act/tc_skbedit.h | 13 ------------- include/uapi/linux/tc_act/tc_skbmod.h | 7 +------ include/uapi/linux/tc_act/tc_tunnel_key.h | 5 ----- include/uapi/linux/tc_act/tc_vlan.h | 5 ----- tools/include/uapi/linux/tc_act/tc_bpf.h | 5 ----- 6 files changed, 1 insertion(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 653c4f94f76e..fe6c8f8f3e8c 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2015 Jiri Pirko - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
*/ #ifndef __LINUX_TC_BPF_H diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index 6cb6101208d0..64032513cc4c 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -2,19 +2,6 @@ /* * Copyright (c) 2008, Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * * Author: Alexander Duyck */ diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h index af6ef2cfbf3d..ac62c9a993ea 100644 --- a/include/uapi/linux/tc_act/tc_skbmod.h +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -1,12 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2016, Jamal Hadi Salim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. -*/ + */ #ifndef __LINUX_TC_SKBMOD_H #define __LINUX_TC_SKBMOD_H diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 3f10dc4e7a4b..49ad4033951b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -2,11 +2,6 @@ /* * Copyright (c) 2016, Amir Vadai * Copyright (c) 2016, Mellanox Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #ifndef __LINUX_TC_TUNNEL_KEY_H diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 5b306fe815cc..3e1f8e57cdd2 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2014 Jiri Pirko - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
*/ #ifndef __LINUX_TC_VLAN_H diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h index 653c4f94f76e..fe6c8f8f3e8c 100644 --- a/tools/include/uapi/linux/tc_act/tc_bpf.h +++ b/tools/include/uapi/linux/tc_act/tc_bpf.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2015 Jiri Pirko - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #ifndef __LINUX_TC_BPF_H -- cgit v1.2.3 From 62e56ef57c04c0cacb33433d7984a4d71b690b3f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 25 Sep 2022 15:00:33 +0000 Subject: net: tls: Add ARIA-GCM algorithm RFC 6209 describes ARIA for TLS 1.2. ARIA-128-GCM and ARIA-256-GCM are defined in RFC 6209. This patch would offer performance increment and an opportunity for hardware offload. Benchmark results: iperf-ssl are used. CPU: intel i3-12100. TLS(openssl-3.0-dev) [ 3] 0.0- 1.0 sec 185 MBytes 1.55 Gbits/sec [ 3] 1.0- 2.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 2.0- 3.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 3.0- 4.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 4.0- 5.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 0.0- 5.0 sec 927 MBytes 1.56 Gbits/sec kTLS(aria-generic) [ 3] 0.0- 1.0 sec 198 MBytes 1.66 Gbits/sec [ 3] 1.0- 2.0 sec 194 MBytes 1.62 Gbits/sec [ 3] 2.0- 3.0 sec 194 MBytes 1.63 Gbits/sec [ 3] 3.0- 4.0 sec 194 MBytes 1.63 Gbits/sec [ 3] 4.0- 5.0 sec 194 MBytes 1.62 Gbits/sec [ 3] 0.0- 5.0 sec 974 MBytes 1.63 Gbits/sec kTLS(aria-avx wirh GFNI) [ 3] 0.0- 1.0 sec 632 MBytes 5.30 Gbits/sec [ 3] 1.0- 2.0 sec 657 MBytes 5.51 Gbits/sec [ 3] 2.0- 3.0 sec 657 MBytes 5.51 Gbits/sec [ 3] 3.0- 4.0 sec 656 MBytes 5.50 Gbits/sec [ 3] 4.0- 5.0 sec 656 MBytes 5.50 Gbits/sec [ 3] 0.0- 5.0 sec 3.18 GBytes 5.47 Gbits/sec Signed-off-by: Taehee Yoo Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20220925150033.24615-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tls.h | 30 +++++++++++++++++++++++ net/tls/tls_main.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 34 ++++++++++++++++++++++++++ 3 files changed, 126 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index f1157d8f4acd..b66a800389cc 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -100,6 +100,20 @@ #define TLS_CIPHER_SM4_CCM_TAG_SIZE 16 #define TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE 8 +#define TLS_CIPHER_ARIA_GCM_128 57 +#define TLS_CIPHER_ARIA_GCM_128_IV_SIZE 8 +#define TLS_CIPHER_ARIA_GCM_128_KEY_SIZE 16 +#define TLS_CIPHER_ARIA_GCM_128_SALT_SIZE 4 +#define TLS_CIPHER_ARIA_GCM_128_TAG_SIZE 16 +#define TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE 8 + +#define TLS_CIPHER_ARIA_GCM_256 58 +#define TLS_CIPHER_ARIA_GCM_256_IV_SIZE 8 +#define TLS_CIPHER_ARIA_GCM_256_KEY_SIZE 32 +#define TLS_CIPHER_ARIA_GCM_256_SALT_SIZE 4 +#define TLS_CIPHER_ARIA_GCM_256_TAG_SIZE 16 +#define TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE 8 + #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD_TYPE 2 @@ -156,6 +170,22 @@ struct tls12_crypto_info_sm4_ccm { unsigned char rec_seq[TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE]; }; +struct tls12_crypto_info_aria_gcm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_ARIA_GCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_ARIA_GCM_128_KEY_SIZE]; + unsigned char 
salt[TLS_CIPHER_ARIA_GCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE]; +}; + +struct tls12_crypto_info_aria_gcm_256 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_ARIA_GCM_256_IV_SIZE]; + unsigned char key[TLS_CIPHER_ARIA_GCM_256_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_ARIA_GCM_256_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE]; +}; + enum { TLS_INFO_UNSPEC, TLS_INFO_VERSION, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 5cc6911cc97d..3735cb00905d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -524,6 +524,54 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, rc = -EFAULT; break; } + case TLS_CIPHER_ARIA_GCM_128: { + struct tls12_crypto_info_aria_gcm_128 * + crypto_info_aria_gcm_128 = + container_of(crypto_info, + struct tls12_crypto_info_aria_gcm_128, + info); + + if (len != sizeof(*crypto_info_aria_gcm_128)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(crypto_info_aria_gcm_128->iv, + cctx->iv + TLS_CIPHER_ARIA_GCM_128_SALT_SIZE, + TLS_CIPHER_ARIA_GCM_128_IV_SIZE); + memcpy(crypto_info_aria_gcm_128->rec_seq, cctx->rec_seq, + TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, + crypto_info_aria_gcm_128, + sizeof(*crypto_info_aria_gcm_128))) + rc = -EFAULT; + break; + } + case TLS_CIPHER_ARIA_GCM_256: { + struct tls12_crypto_info_aria_gcm_256 * + crypto_info_aria_gcm_256 = + container_of(crypto_info, + struct tls12_crypto_info_aria_gcm_256, + info); + + if (len != sizeof(*crypto_info_aria_gcm_256)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(crypto_info_aria_gcm_256->iv, + cctx->iv + TLS_CIPHER_ARIA_GCM_256_SALT_SIZE, + TLS_CIPHER_ARIA_GCM_256_IV_SIZE); + memcpy(crypto_info_aria_gcm_256->rec_seq, cctx->rec_seq, + TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, + crypto_info_aria_gcm_256, + sizeof(*crypto_info_aria_gcm_256))) + rc = -EFAULT; + break; + } default: rc = -EINVAL; } @@ -685,6 +733,20 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, case TLS_CIPHER_SM4_CCM: optsize = sizeof(struct tls12_crypto_info_sm4_ccm); break; + case TLS_CIPHER_ARIA_GCM_128: + if (crypto_info->version != TLS_1_2_VERSION) { + rc = -EINVAL; + goto err_crypto_info; + } + optsize = sizeof(struct tls12_crypto_info_aria_gcm_128); + break; + case TLS_CIPHER_ARIA_GCM_256: + if (crypto_info->version != TLS_1_2_VERSION) { + rc = -EINVAL; + goto err_crypto_info; + } + optsize = sizeof(struct tls12_crypto_info_aria_gcm_256); + break; default: rc = -EINVAL; goto err_crypto_info; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fe27241cd13f..264cf367e265 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2629,6 +2629,40 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cipher_name = "ccm(sm4)"; break; } + case TLS_CIPHER_ARIA_GCM_128: { + struct tls12_crypto_info_aria_gcm_128 *aria_gcm_128_info; + + aria_gcm_128_info = (void *)crypto_info; + nonce_size = TLS_CIPHER_ARIA_GCM_128_IV_SIZE; + tag_size = TLS_CIPHER_ARIA_GCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_ARIA_GCM_128_IV_SIZE; + iv = aria_gcm_128_info->iv; + rec_seq_size = TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE; + rec_seq = aria_gcm_128_info->rec_seq; + keysize = TLS_CIPHER_ARIA_GCM_128_KEY_SIZE; + key = aria_gcm_128_info->key; + salt = aria_gcm_128_info->salt; + salt_size = TLS_CIPHER_ARIA_GCM_128_SALT_SIZE; + cipher_name = "gcm(aria)"; + break; + } + case TLS_CIPHER_ARIA_GCM_256: { + 
struct tls12_crypto_info_aria_gcm_256 *gcm_256_info; + + gcm_256_info = (void *)crypto_info; + nonce_size = TLS_CIPHER_ARIA_GCM_256_IV_SIZE; + tag_size = TLS_CIPHER_ARIA_GCM_256_TAG_SIZE; + iv_size = TLS_CIPHER_ARIA_GCM_256_IV_SIZE; + iv = gcm_256_info->iv; + rec_seq_size = TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE; + rec_seq = gcm_256_info->rec_seq; + keysize = TLS_CIPHER_ARIA_GCM_256_KEY_SIZE; + key = gcm_256_info->key; + salt = gcm_256_info->salt; + salt_size = TLS_CIPHER_ARIA_GCM_256_SALT_SIZE; + cipher_name = "gcm(aria)"; + break; + } default: rc = -EINVAL; goto free_priv; -- cgit v1.2.3 From 3137f2e60098bccdf9b3a744747b06a96addab9e Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 26 Sep 2022 14:07:58 +0300 Subject: firmware/psci: Add debugfs support to ease debugging To ease debugging of PSCI supported features, add debugfs file called 'psci' describing PSCI and SMC CC versions, enabled features and options. Signed-off-by: Dmitry Baryshkov Reviewed-by: Mark Brown Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220926110758.666922-1-dmitry.baryshkov@linaro.org' Signed-off-by: Arnd Bergmann --- drivers/firmware/psci/psci.c | 116 ++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/psci.h | 14 ++++++ 2 files changed, 129 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 1628f1edef4a..42cae0ba10e2 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -326,12 +327,125 @@ static void psci_sys_poweroff(void) invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); } -static int __init psci_features(u32 psci_func_id) +static int psci_features(u32 psci_func_id) { return invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES, psci_func_id, 0, 0); } +#ifdef CONFIG_DEBUG_FS + +#define PSCI_ID(ver, _name) \ + { .fn = PSCI_##ver##_FN_##_name, .name = #_name, } +#define PSCI_ID_NATIVE(ver, _name) \ + { .fn = PSCI_FN_NATIVE(ver, _name), .name = #_name, } + +/* A table of all optional functions */ +static const struct { + u32 fn; + const char *name; +} psci_fn_ids[] = { + PSCI_ID_NATIVE(0_2, MIGRATE), + PSCI_ID(0_2, MIGRATE_INFO_TYPE), + PSCI_ID_NATIVE(0_2, MIGRATE_INFO_UP_CPU), + PSCI_ID(1_0, CPU_FREEZE), + PSCI_ID_NATIVE(1_0, CPU_DEFAULT_SUSPEND), + PSCI_ID_NATIVE(1_0, NODE_HW_STATE), + PSCI_ID_NATIVE(1_0, SYSTEM_SUSPEND), + PSCI_ID(1_0, SET_SUSPEND_MODE), + PSCI_ID_NATIVE(1_0, STAT_RESIDENCY), + PSCI_ID_NATIVE(1_0, STAT_COUNT), + PSCI_ID_NATIVE(1_1, SYSTEM_RESET2), + PSCI_ID(1_1, MEM_PROTECT), + PSCI_ID_NATIVE(1_1, MEM_PROTECT_CHECK_RANGE), +}; + +static int psci_debugfs_read(struct seq_file *s, void *data) +{ + int feature, type, i; + u32 ver; + + ver = psci_ops.get_version(); + seq_printf(s, "PSCIv%d.%d\n", + PSCI_VERSION_MAJOR(ver), + PSCI_VERSION_MINOR(ver)); + + /* PSCI_FEATURES is available only starting from 1.0 */ + if (PSCI_VERSION_MAJOR(ver) < 1) + return 0; + + feature = psci_features(ARM_SMCCC_VERSION_FUNC_ID); + if (feature != PSCI_RET_NOT_SUPPORTED) { + ver = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0); + seq_printf(s, "SMC Calling Convention v%d.%d\n", + PSCI_VERSION_MAJOR(ver), + PSCI_VERSION_MINOR(ver)); + } else { + seq_puts(s, "SMC Calling Convention v1.0 is assumed\n"); + } + + feature = psci_features(PSCI_FN_NATIVE(0_2, CPU_SUSPEND)); + if (feature < 0) { + seq_printf(s, "PSCI_FEATURES(CPU_SUSPEND) error (%d)\n", feature); + } else { + seq_printf(s, "OSI is 
%ssupported\n", + (feature & BIT(0)) ? "" : "not "); + seq_printf(s, "%s StateID format is used\n", + (feature & BIT(1)) ? "Extended" : "Original"); + } + + type = psci_ops.migrate_info_type(); + if (type == PSCI_0_2_TOS_UP_MIGRATE || + type == PSCI_0_2_TOS_UP_NO_MIGRATE) { + unsigned long cpuid; + + seq_printf(s, "Trusted OS %smigrate capable\n", + type == PSCI_0_2_TOS_UP_NO_MIGRATE ? "not " : ""); + cpuid = psci_migrate_info_up_cpu(); + seq_printf(s, "Trusted OS resident on physical CPU 0x%lx (#%d)\n", + cpuid, resident_cpu); + } else if (type == PSCI_0_2_TOS_MP) { + seq_puts(s, "Trusted OS migration not required\n"); + } else { + if (type != PSCI_RET_NOT_SUPPORTED) + seq_printf(s, "MIGRATE_INFO_TYPE returned unknown type (%d)\n", type); + } + + for (i = 0; i < ARRAY_SIZE(psci_fn_ids); i++) { + feature = psci_features(psci_fn_ids[i].fn); + if (feature == PSCI_RET_NOT_SUPPORTED) + continue; + if (feature < 0) + seq_printf(s, "PSCI_FEATURES(%s) error (%d)\n", + psci_fn_ids[i].name, feature); + else + seq_printf(s, "%s is supported\n", psci_fn_ids[i].name); + } + + return 0; +} + +static int psci_debugfs_open(struct inode *inode, struct file *f) +{ + return single_open(f, psci_debugfs_read, NULL); +} + +static const struct file_operations psci_debugfs_ops = { + .owner = THIS_MODULE, + .open = psci_debugfs_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek +}; + +static int __init psci_debugfs_init(void) +{ + return PTR_ERR_OR_ZERO(debugfs_create_file("psci", 0444, NULL, NULL, + &psci_debugfs_ops)); +} +late_initcall(psci_debugfs_init) +#endif + #ifdef CONFIG_CPU_IDLE static int psci_suspend_finisher(unsigned long state) { diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 2bf93c0d6354..3511095c2702 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -48,12 +48,26 @@ #define PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU PSCI_0_2_FN64(7) #define PSCI_1_0_FN_PSCI_FEATURES PSCI_0_2_FN(10) +#define PSCI_1_0_FN_CPU_FREEZE PSCI_0_2_FN(11) +#define PSCI_1_0_FN_CPU_DEFAULT_SUSPEND PSCI_0_2_FN(12) +#define PSCI_1_0_FN_NODE_HW_STATE PSCI_0_2_FN(13) #define PSCI_1_0_FN_SYSTEM_SUSPEND PSCI_0_2_FN(14) #define PSCI_1_0_FN_SET_SUSPEND_MODE PSCI_0_2_FN(15) +#define PSCI_1_0_FN_STAT_RESIDENCY PSCI_0_2_FN(16) +#define PSCI_1_0_FN_STAT_COUNT PSCI_0_2_FN(17) + #define PSCI_1_1_FN_SYSTEM_RESET2 PSCI_0_2_FN(18) +#define PSCI_1_1_FN_MEM_PROTECT PSCI_0_2_FN(19) +#define PSCI_1_1_FN_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN(19) +#define PSCI_1_0_FN64_CPU_DEFAULT_SUSPEND PSCI_0_2_FN64(12) +#define PSCI_1_0_FN64_NODE_HW_STATE PSCI_0_2_FN64(13) #define PSCI_1_0_FN64_SYSTEM_SUSPEND PSCI_0_2_FN64(14) +#define PSCI_1_0_FN64_STAT_RESIDENCY PSCI_0_2_FN64(16) +#define PSCI_1_0_FN64_STAT_COUNT PSCI_0_2_FN64(17) + #define PSCI_1_1_FN64_SYSTEM_RESET2 PSCI_0_2_FN64(18) +#define PSCI_1_1_FN64_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN64(19) /* PSCI v0.2 power state encoding for CPU_SUSPEND function */ #define PSCI_0_2_POWER_STATE_ID_MASK 0xffff -- cgit v1.2.3 From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. 
Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 ++++++ include/uapi/linux/bpf.h | 6 ++ kernel/bpf/task_iter.c | 188 ++++++++++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 6 ++ 4 files changed, 203 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..8b2f47e7139d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,9 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +25,115 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + /* The first time, the iterator calls this function. */ + pid = find_pid_ns(common->pid, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) + return NULL; + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* If the control returns to user space and comes back to the + * kernel again, *tid and common->pid_visiting should be the + * same for task_seq_start() to pick up the correct task. 
+ */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + +retry: + if (!pid_alive(task)) { + put_task_struct(task); + return NULL; + } + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (!*tid || *tid == common->pid) { + /* Run out of tasks of a process. The tasks of a + * thread_group are linked as circular linked list. + */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) { + task = next_task; + goto retry; + } + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +156,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +173,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +217,41 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, task_active_pid_ns(current)); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +272,7 @@ struct bpf_iter_seq_task_file_info { static struct file * 
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +285,18 @@ again: curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +317,15 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +406,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -307,11 +447,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. 
@@ -371,14 +510,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +568,12 @@ again: return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +674,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +693,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +714,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ -- cgit v1.2.3 From 21fb6f2aa3890b0d0abf88b7756d0098e9367a7c Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:54 -0700 Subject: bpf: Handle bpf_link_info for the parameterized task BPF iterators. Add new fields to bpf_link_info that users can query through bpf_obj_get_info_by_fd().
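For illustration, a minimal user-space sketch of both halves follows: attaching a task iterator restricted to one process, then reading the parameters back through bpf_obj_get_info_by_fd(). The program name is hypothetical, a libbpf recent enough to pass union bpf_iter_link_info via bpf_iter_attach_opts is assumed, and error handling is trimmed.

#include <stdio.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Sketch: attach an iter/task-flavored program to every task of one
 * process (tgid), i.e. the BPF_TASK_ITER_TGID case.
 */
static int attach_task_iter(struct bpf_program *prog, __u32 tgid)
{
	union bpf_iter_link_info linfo = {};
	struct bpf_link_info info = {};
	__u32 len = sizeof(info);
	struct bpf_link *link;
	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts,
			    .link_info = &linfo,
			    .link_info_len = sizeof(linfo));

	linfo.task.pid = tgid;	/* visit the whole thread group */

	link = bpf_program__attach_iter(prog, &opts);
	if (!link)
		return -1;

	/* The new bpf_link_info fields report the parameters back. */
	if (!bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &len))
		printf("task iter parameterized with pid %u\n",
		       info.iter.task.pid);

	return 0;
}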
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-3-kuifeng@fb.com --- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/task_iter.c | 18 ++++++++++++++++++ tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 26 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8b2f47e7139d..46f836be22e2 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -672,6 +672,21 @@ static const struct bpf_iter_seq_info task_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), }; +static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info) +{ + switch (aux->task.type) { + case BPF_TASK_ITER_TID: + info->iter.task.tid = aux->task.pid; + break; + case BPF_TASK_ITER_TGID: + info->iter.task.pid = aux->task.pid; + break; + default: + break; + } + return 0; +} + static struct bpf_iter_reg task_reg_info = { .target = "task", .attach_target = bpf_iter_attach_task, @@ -682,6 +697,7 @@ static struct bpf_iter_reg task_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; static const struct bpf_iter_seq_info task_file_seq_info = { @@ -703,6 +719,7 @@ static struct bpf_iter_reg task_file_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_file_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -724,6 +741,7 @@ static struct bpf_iter_reg task_vma_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_vma_seq_info, + .fill_link_info = bpf_iter_fill_link_info, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { -- cgit v1.2.3 From 1260cd04a601e0e02e09fa332111b8639611970d Mon Sep 17 00:00:00 2001 From: Nate Yocom Date: Wed, 28 Sep 2022 18:23:22 -0700 Subject: Input: add ABS_PROFILE to uapi and documentation Define new ABS_PROFILE axis for input devices which need it, e.g. X-Box Adaptive Controller and X-Box Elite 2. 
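Consumers see profile switches as ordinary EV_ABS events; a minimal evdev reader could look like the following sketch. The device node is an assumption, and ABS_PROFILE requires headers carrying this patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/input.h>

int main(void)
{
	struct input_event ev;
	int fd = open("/dev/input/event5", O_RDONLY);	/* assumed gamepad node */

	if (fd < 0)
		return 1;
	while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
		/* An event arrives only when the selected profile changes. */
		if (ev.type == EV_ABS && ev.code == ABS_PROFILE)
			printf("profile switched to %d\n", ev.value);
	}
	close(fd);
	return 0;
}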
Signed-off-by: Nate Yocom Link: https://lore.kernel.org/r/20220908173930.28940-4-nate@yocom.org Signed-off-by: Dmitry Torokhov --- Documentation/input/event-codes.rst | 6 ++++++ Documentation/input/gamepad.rst | 6 ++++++ drivers/hid/hid-debug.c | 3 ++- include/uapi/linux/input-event-codes.h | 1 + 4 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index 8741d390b184..b4557462edd7 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -235,6 +235,12 @@ A few EV_ABS codes have special meanings: BTN_TOOL_ signals the type of tool that is currently detected by the hardware and is otherwise independent of ABS_DISTANCE and/or BTN_TOUCH. +* ABS_PROFILE: + + - Used to describe the state of a multi-value profile switch. An event is + emitted only when the selected profile changes, indicating the newly + selected profile value. + * ABS_MT_: - Used to describe multitouch input events. Please see diff --git a/Documentation/input/gamepad.rst b/Documentation/input/gamepad.rst index 4d5e7fb80a84..71019de46036 100644 --- a/Documentation/input/gamepad.rst +++ b/Documentation/input/gamepad.rst @@ -189,3 +189,9 @@ Gamepads report the following events: - Rumble: Rumble is advertised as FF_RUMBLE. + +- Profile: + + Some pads provide a multi-value profile selection switch. An example is the + XBox Adaptive and the XBox Elite 2 controllers. When the active profile is + switched, its newly selected value is emitted as an ABS_PROFILE event. diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 81e7e404a5fc..2ca6ab600bc9 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -1014,7 +1014,8 @@ static const char *absolutes[ABS_CNT] = { [ABS_HAT3Y] = "Hat 3Y", [ABS_PRESSURE] = "Pressure", [ABS_DISTANCE] = "Distance", [ABS_TILT_X] = "XTilt", [ABS_TILT_Y] = "YTilt", [ABS_TOOL_WIDTH] = "ToolWidth", - [ABS_VOLUME] = "Volume", [ABS_MISC] = "Misc", + [ABS_VOLUME] = "Volume", [ABS_PROFILE] = "Profile", + [ABS_MISC] = "Misc", [ABS_MT_TOUCH_MAJOR] = "MTMajor", [ABS_MT_TOUCH_MINOR] = "MTMinor", [ABS_MT_WIDTH_MAJOR] = "MTMajorW", diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index dff8e7f17074..7ad931a32970 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -862,6 +862,7 @@ #define ABS_TOOL_WIDTH 0x1c #define ABS_VOLUME 0x20 +#define ABS_PROFILE 0x21 #define ABS_MISC 0x28 -- cgit v1.2.3 From 17601bfed909fa080fcfd227b57da2bd4dc2d2a6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Sep 2022 15:51:16 +0100 Subject: KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option In order to differentiate between architectures that require no extra synchronisation when accessing the dirty ring and those that do, add a new capability (KVM_CAP_DIRTY_LOG_RING_ACQ_REL) that identifies the latter sort. TSO architectures can obviously advertise both, while relaxed architectures must only advertise the ACQ_REL version.
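Given those advertisement rules, a VMM is expected to probe for the strongest flavour it understands before enabling the ring; a minimal sketch (vm_fd is assumed to be an open VM descriptor, ring_bytes a valid ring size):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: prefer the ACQ_REL flavour, fall back to the TSO-only one. */
static int enable_dirty_ring(int vm_fd, __u32 ring_bytes)
{
	struct kvm_enable_cap cap = { .args = { ring_bytes } };

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DIRTY_LOG_RING_ACQ_REL) > 0)
		cap.cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
	else if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DIRTY_LOG_RING) > 0)
		cap.cap = KVM_CAP_DIRTY_LOG_RING;
	else
		return -1;	/* no dirty ring support */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}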
This requires some configuration symbol rejigging, with HAVE_KVM_DIRTY_RING being only indirectly selected by two top-level config symbols: - HAVE_KVM_DIRTY_RING_TSO for strongly ordered architectures (x86) - HAVE_KVM_DIRTY_RING_ACQ_REL for weakly ordered architectures (arm64) Suggested-by: Paolo Bonzini Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20220926145120.27974-3-maz@kernel.org --- arch/x86/kvm/Kconfig | 2 +- include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 14 ++++++++++++++ virt/kvm/kvm_main.c | 9 ++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index e3cbd7706136..876748b236ff 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,7 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD - select HAVE_KVM_DIRTY_RING + select HAVE_KVM_DIRTY_RING_TSO select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index eed0315a77a6..0d5d4419139a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index a8c5c9f06b3c..800f9470e36b 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -19,6 +19,20 @@ config HAVE_KVM_IRQ_ROUTING config HAVE_KVM_DIRTY_RING bool +# Only strongly ordered architectures can select this, as it doesn't +# put any explicit constraint on userspace ordering. They can also +# select the _ACQ_REL version. +config HAVE_KVM_DIRTY_RING_TSO + bool + select HAVE_KVM_DIRTY_RING + depends on X86 + +# Weakly ordered architectures can only select this, advertising +# to userspace the additional ordering requirements. +config HAVE_KVM_DIRTY_RING_ACQ_REL + bool + select HAVE_KVM_DIRTY_RING + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 584a5bab3af3..5b064dbadaf4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4475,7 +4475,13 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; case KVM_CAP_DIRTY_LOG_RING: -#ifdef CONFIG_HAVE_KVM_DIRTY_RING +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO + return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: +#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; @@ -4580,6 +4586,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return 0; } case KVM_CAP_DIRTY_LOG_RING: + case KVM_CAP_DIRTY_LOG_RING_ACQ_REL: return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); default: return kvm_vm_ioctl_enable_cap(kvm, cap); -- cgit v1.2.3 From ee3e88dfec23153d0675b5d00522297b9adf657c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:51 +0530 Subject: perf/mem: Introduce PERF_MEM_LVLNUM_{EXTN_MEM|IO} Introduce PERF_MEM_LVLNUM_EXTN_MEM, which can be used to indicate accesses to extension memory like CXL etc. PERF_MEM_LVL_IO can be used for IO accesses but it cannot distinguish between local and remote IO.
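Once both encodings below are in place, a profiler consuming PERF_SAMPLE_DATA_SRC could classify samples along these lines (sketch; the constants require a perf_event.h carrying this pair of patches):

#include <stdio.h>
#include <linux/perf_event.h>

/* Sketch: decode the new level-number encodings from a sample. */
static void print_mem_level(union perf_mem_data_src dsrc)
{
	switch (dsrc.mem_lvl_num) {
	case PERF_MEM_LVLNUM_IO:
		printf("%s IO access\n", dsrc.mem_remote ? "remote" : "local");
		break;
	case PERF_MEM_LVLNUM_EXTN_MEM:
		printf("extension memory (e.g. CXL) access\n");
		break;
	default:
		printf("other level number %u\n", (unsigned int)dsrc.mem_lvl_num);
	}
}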
Introduce new field PERF_MEM_LVLNUM_IO which can be clubbed with PERF_MEM_REMOTE_REMOTE to indicate Remote IO accesses. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-2-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e639c74cf5fb..4ae3c249f675 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1336,7 +1336,9 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0xa available */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -- cgit v1.2.3 From cfef80bad4cf79cdc964a53c98254dfa462be83f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:57 +0530 Subject: perf/uapi: Define PERF_MEM_SNOOPX_PEER in kernel header file PERF_MEM_SNOOPX_PEER is defined only in tools uapi header. Although it's used only by perf tool, not defining it in kernel header can create problems in future. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-8-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4ae3c249f675..85be78e0e7f6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1356,7 +1356,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_SHIFT 19 #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -/* 1 free */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ #define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ -- cgit v1.2.3 From 2fff00c81d4c37a037cf704d2d219fbcb45aea3c Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 23 Sep 2022 17:42:07 +0200 Subject: landlock: Fix documentation style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems that all code should use double backquotes, which is also used to convert "%" defines. Let's use an homogeneous style and remove all use of simple backquotes (which should only be used for emphasis). Cc: Günther Noack Cc: Paul Moore Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20220923154207.3311629-4-mic@digikod.net --- Documentation/security/landlock.rst | 4 ++-- Documentation/userspace-api/landlock.rst | 25 ++++++++++---------- include/uapi/linux/landlock.h | 10 ++++---- security/landlock/syscalls.c | 40 ++++++++++++++++---------------- 4 files changed, 40 insertions(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst index cc9617f3175b..c0029d5d02eb 100644 --- a/Documentation/security/landlock.rst +++ b/Documentation/security/landlock.rst @@ -54,8 +54,8 @@ content of a listed inode. Indeed, a file name is local to its parent directory, and an inode can be referenced by multiple file names thanks to (hard) links. Being able to unlink a file only has a direct impact on the directory, not the unlinked inode. 
This is the reason why -`LANDLOCK_ACCESS_FS_REMOVE_FILE` or `LANDLOCK_ACCESS_FS_REFER` are not allowed -to be tied to files but only to directories. +``LANDLOCK_ACCESS_FS_REMOVE_FILE`` or ``LANDLOCK_ACCESS_FS_REFER`` are not +allowed to be tied to files but only to directories. Tests ===== diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst index 83bae71bf042..cec780c2f497 100644 --- a/Documentation/userspace-api/landlock.rst +++ b/Documentation/userspace-api/landlock.rst @@ -69,7 +69,7 @@ should try to protect users as much as possible whatever the kernel they are using. To avoid binary enforcement (i.e. either all security features or none), we can leverage a dedicated Landlock command to get the current version of the Landlock ABI and adapt the handled accesses. Let's check if we should -remove the `LANDLOCK_ACCESS_FS_REFER` access right which is only supported +remove the ``LANDLOCK_ACCESS_FS_REFER`` access right which is only supported starting with the second version of the ABI. .. code-block:: c @@ -128,7 +128,7 @@ descriptor. It may also be required to create rules following the same logic as explained for the ruleset creation, by filtering access rights according to the Landlock ABI version. In this example, this is not required because -`LANDLOCK_ACCESS_FS_REFER` is not allowed by any rule. +``LANDLOCK_ACCESS_FS_REFER`` is not allowed by any rule. We now have a ruleset with one rule allowing read access to ``/usr`` while denying all other handled accesses for the filesystem. The next step is to @@ -154,8 +154,8 @@ The current thread is now ready to sandbox itself with the ruleset. } close(ruleset_fd); -If the `landlock_restrict_self` system call succeeds, the current thread is now -restricted and this policy will be enforced on all its subsequently created +If the ``landlock_restrict_self`` system call succeeds, the current thread is +now restricted and this policy will be enforced on all its subsequently created children as well. Once a thread is landlocked, there is no way to remove its security policy; only adding more restrictions is allowed. These threads are now in a new Landlock domain, merge of their parent one (if any) with the new @@ -175,7 +175,8 @@ depend on their location (i.e. parent directories). This is particularly relevant when we want to allow linking or renaming. Indeed, having consistent access rights per directory enables to change the location of such directory without relying on the destination directory access rights (except those that -are required for this operation, see `LANDLOCK_ACCESS_FS_REFER` documentation). +are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER`` +documentation). Having self-sufficient hierarchies also helps to tighten the required access rights to the minimal set of data. This also helps avoid sinkhole directories, i.e. directories where data can be linked to but not linked from. However, @@ -259,7 +260,7 @@ Backward and forward compatibility Landlock is designed to be compatible with past and future versions of the kernel. This is achieved thanks to the system call attributes and the -associated bitflags, particularly the ruleset's `handled_access_fs`. Making +associated bitflags, particularly the ruleset's ``handled_access_fs``. Making handled access right explicit enables the kernel and user space to have a clear contract with each other. This is required to make sure sandboxing will not get stricter with a system update, which could break applications. 
@@ -394,7 +395,7 @@ according to the potentially lost constraints. To protect against privilege escalations through renaming or linking, and for the sake of simplicity, Landlock previously limited linking and renaming to the same directory. Starting with the Landlock ABI version 2, it is now possible to securely -control renaming and linking thanks to the new `LANDLOCK_ACCESS_FS_REFER` +control renaming and linking thanks to the new ``LANDLOCK_ACCESS_FS_REFER`` access right. .. _kernel_support: @@ -403,14 +404,14 @@ Kernel support ============== Landlock was first introduced in Linux 5.13 but it must be configured at build -time with `CONFIG_SECURITY_LANDLOCK=y`. Landlock must also be enabled at boot +time with ``CONFIG_SECURITY_LANDLOCK=y``. Landlock must also be enabled at boot time as the other security modules. The list of security modules enabled by -default is set with `CONFIG_LSM`. The kernel configuration should then -contains `CONFIG_LSM=landlock,[...]` with `[...]` as the list of other +default is set with ``CONFIG_LSM``. The kernel configuration should then +contains ``CONFIG_LSM=landlock,[...]`` with ``[...]`` as the list of other potentially useful security modules for the running system (see the -`CONFIG_LSM` help). +``CONFIG_LSM`` help). -If the running kernel does not have `landlock` in `CONFIG_LSM`, then we can +If the running kernel does not have ``landlock`` in ``CONFIG_LSM``, then we can still enable it by adding ``lsm=landlock,[...]`` to Documentation/admin-guide/kernel-parameters.rst thanks to the bootloader configuration. diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 23df4e0e8ace..9c4bcc37a455 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -26,7 +26,7 @@ struct landlock_ruleset_attr { * Landlock filesystem access rights that are not part of * handled_access_fs are allowed. This is needed for backward * compatibility reasons. One exception is the - * LANDLOCK_ACCESS_FS_REFER access right, which is always implicitly + * %LANDLOCK_ACCESS_FS_REFER access right, which is always implicitly * handled, but must still be explicitly handled to add new rules with * this access right. */ @@ -128,11 +128,11 @@ struct landlock_path_beneath_attr { * hierarchy must also always have the same or a superset of restrictions of * the source hierarchy. If it is not the case, or if the domain doesn't * handle this access right, such actions are denied by default with errno - * set to EXDEV. Linking also requires a LANDLOCK_ACCESS_FS_MAKE_* access - * right on the destination directory, and renaming also requires a - * LANDLOCK_ACCESS_FS_REMOVE_* access right on the source's (file or + * set to ``EXDEV``. Linking also requires a ``LANDLOCK_ACCESS_FS_MAKE_*`` + * access right on the destination directory, and renaming also requires a + * ``LANDLOCK_ACCESS_FS_REMOVE_*`` access right on the source's (file or * directory) parent. Otherwise, such actions are denied with errno set to - * EACCES. The EACCES errno prevails over EXDEV to let user space + * ``EACCES``. The ``EACCES`` errno prevails over ``EXDEV`` to let user space * efficiently deal with an unrecoverable error. * * .. 
warning:: diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 735a0865ea11..2ca0ccbd905a 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -149,10 +149,10 @@ static const struct file_operations ruleset_fops = { * * Possible returned errors are: * - * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; - * - EINVAL: unknown @flags, or unknown access, or too small @size; - * - E2BIG or EFAULT: @attr or @size inconsistencies; - * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs. + * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; + * - %EINVAL: unknown @flags, or unknown access, or too small @size; + * - %E2BIG or %EFAULT: @attr or @size inconsistencies; + * - %ENOMSG: empty &landlock_ruleset_attr.handled_access_fs. */ SYSCALL_DEFINE3(landlock_create_ruleset, const struct landlock_ruleset_attr __user *const, attr, @@ -280,7 +280,7 @@ out_fdput: * @ruleset_fd: File descriptor tied to the ruleset that should be extended * with the new rule. * @rule_type: Identify the structure type pointed to by @rule_attr (only - * LANDLOCK_RULE_PATH_BENEATH for now). + * %LANDLOCK_RULE_PATH_BENEATH for now). * @rule_attr: Pointer to a rule (only of type &struct * landlock_path_beneath_attr for now). * @flags: Must be 0. @@ -290,17 +290,17 @@ out_fdput: * * Possible returned errors are: * - * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; - * - EINVAL: @flags is not 0, or inconsistent access in the rule (i.e. + * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; + * - %EINVAL: @flags is not 0, or inconsistent access in the rule (i.e. * &landlock_path_beneath_attr.allowed_access is not a subset of the * ruleset handled accesses); - * - ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access); - * - EBADF: @ruleset_fd is not a file descriptor for the current thread, or a + * - %ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access); + * - %EBADF: @ruleset_fd is not a file descriptor for the current thread, or a * member of @rule_attr is not a file descriptor as expected; - * - EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of + * - %EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of * @rule_attr is not the expected file descriptor type; - * - EPERM: @ruleset_fd has no write access to the underlying ruleset; - * - EFAULT: @rule_attr inconsistency. + * - %EPERM: @ruleset_fd has no write access to the underlying ruleset; + * - %EFAULT: @rule_attr inconsistency. */ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, const enum landlock_rule_type, rule_type, @@ -378,20 +378,20 @@ out_put_ruleset: * @flags: Must be 0. * * This system call enables to enforce a Landlock ruleset on the current - * thread. Enforcing a ruleset requires that the task has CAP_SYS_ADMIN in its + * thread. Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its * namespace or is running with no_new_privs. This avoids scenarios where * unprivileged tasks can affect the behavior of privileged children. * * Possible returned errors are: * - * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; - * - EINVAL: @flags is not 0. 
- * - EBADF: @ruleset_fd is not a file descriptor for the current thread; - * - EBADFD: @ruleset_fd is not a ruleset file descriptor; - * - EPERM: @ruleset_fd has no read access to the underlying ruleset, or the + * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; + * - %EINVAL: @flags is not 0. + * - %EBADF: @ruleset_fd is not a file descriptor for the current thread; + * - %EBADFD: @ruleset_fd is not a ruleset file descriptor; + * - %EPERM: @ruleset_fd has no read access to the underlying ruleset, or the * current thread is not running with no_new_privs, or it doesn't have - * CAP_SYS_ADMIN in its namespace. - * - E2BIG: The maximum number of stacked rulesets is reached for the current + * %CAP_SYS_ADMIN in its namespace. + * - %E2BIG: The maximum number of stacked rulesets is reached for the current * thread. */ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, -- cgit v1.2.3 From 5493a2ad0d20944b16aba7ed7a951a43ad1f5fba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Sep 2022 14:23:06 -0700 Subject: docs: netlink: clarify the historical baggage of Netlink flags nlmsg_flags are full of historical baggage, inconsistencies and strangeness. Try to document it more thoroughly. Explain the meaning of the ECHO flag (and while at it clarify the comment in the uAPI). Handwave a little about the NEW request flags and how they make sense on the surface but cater to a really old paradigm before commands were a thing. I will add more notes on how to make use of ECHO and discouragement for reuse of flags to the kernel-side documentation. Link: https://lore.kernel.org/r/20220927212306.823862-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/userspace-api/netlink/intro.rst | 61 +++++++++++++++++++++------ include/uapi/linux/netlink.h | 2 +- 2 files changed, 49 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/netlink/intro.rst b/Documentation/userspace-api/netlink/intro.rst index 8f1220756412..0955e9f203d3 100644 --- a/Documentation/userspace-api/netlink/intro.rst +++ b/Documentation/userspace-api/netlink/intro.rst @@ -623,22 +623,57 @@ Even though other protocols and Generic Netlink commands often use the same verbs in their message names (``GET``, ``SET``) the concept of request types did not find wider adoption. -Message flags ------------- - +Notification echo +----------------- + +``NLM_F_ECHO`` requests for notifications resulting from the request +to be queued onto the requesting socket. This is useful to discover +the impact of the request. + +Note that this feature is not universally implemented. + +Other request-type-specific flags +--------------------------------- + +Classic Netlink defined various flags for its ``GET``, ``NEW`` +and ``DEL`` requests in the upper byte of nlmsg_flags in struct nlmsghdr. +Since request types have not been generalized the request type specific +flags are rarely used (and considered deprecated for new families). + +For ``GET`` - ``NLM_F_ROOT`` and ``NLM_F_MATCH`` are combined into +``NLM_F_DUMP``, and not used separately. ``NLM_F_ATOMIC`` is never used. + +For ``DEL`` - ``NLM_F_NONREC`` is only used by nftables and ``NLM_F_BULK`` +only by some FDB operations. + +The flags for ``NEW`` are used most commonly in classic Netlink. Unfortunately, +the meaning is not crystal clear. The following description is based on the +best guess of the intention of the authors, and in practice all families +stray from it in one way or another.
``NLM_F_REPLACE`` asks to replace +an existing object, if no matching object exists the operation should fail. +``NLM_F_EXCL`` has the opposite semantics and fails if a matching object +already exists. +``NLM_F_CREATE`` asks for the object to be created if it does not +exist, it can be combined with ``NLM_F_REPLACE`` and ``NLM_F_EXCL``. + +A comment in the main Netlink uAPI header states:: + + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE -The earlier section has already covered the basic request flags -(``NLM_F_REQUEST``, ``NLM_F_ACK``, ``NLM_F_DUMP``) and the ``NLMSG_ERROR`` / -``NLMSG_DONE`` flags (``NLM_F_CAPPED``, ``NLM_F_ACK_TLVS``). -Dump flags were also mentioned (``NLM_F_MULTI``, ``NLM_F_DUMP_INTR``). + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL -Those are the main flags of note, with a small exception (of ``ieee802154``) -Generic Netlink does not make use of other flags. If the protocol needs -to communicate special constraints for a request it should use -an attribute, not the flags in struct nlmsghdr. +which seems to indicate that those flags predate request types. +``NLM_F_REPLACE`` without ``NLM_F_CREATE`` was initially used instead +of ``SET`` commands. +``NLM_F_EXCL`` without ``NLM_F_CREATE`` was used to check if object exists +without creating it, presumably predating ``GET`` commands. -Classic Netlink, however, defined various flags for its ``GET``, ``NEW`` -and ``DEL`` requests. Since request types have not been generalized -the request type specific flags should not be used either. +``NLM_F_APPEND`` indicates that if one key can have multiple objects associated +with it (e.g. multiple next-hop objects for a route) the new object should be +added to the list rather than replacing the entire list. uAPI reference ============== diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e0689dbd2cde..e2ae82e3f9f7 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -62,7 +62,7 @@ struct nlmsghdr { #define NLM_F_REQUEST 0x01 /* It is request message. */ #define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ #define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ -#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_ECHO 0x08 /* Receive resulting notifications */ #define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ #define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ -- cgit v1.2.3 From a54fc09e4cba3004443aa05979f8c678196c8226 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 28 Sep 2022 12:51:58 +0300 Subject: net/sched: taprio: allow user input of per-tc max SDU IEEE 802.1Q clause 12.29.1.1 "The queueMaxSDUTable structure and data types" and 8.6.8.4 "Enhancements for scheduled traffic" talk about the existence of a per traffic class limitation of maximum frame sizes, with a fallback on the port-based MTU. As far as I am able to understand, the 802.1Q Service Data Unit (SDU) represents the MAC Service Data Unit (MSDU, i.e. L2 payload), excluding any number of prepended VLAN headers which may be otherwise present in the MSDU. Therefore, the queueMaxSDU is directly comparable to the device MTU (1500 means L2 payload sizes are accepted, or frame sizes of 1518 octets, or 1522 plus one VLAN header). Drivers which offload this are directly responsible for translating into other units of measurement.
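For instance, a hypothetical offloading driver (which would also have to report supports_queue_max_sdu when its capabilities are queried) could translate the limit as in this sketch; the foo_* names are made up:

#include <linux/if_vlan.h>
#include <linux/netdevice.h>
#include <net/pkt_sched.h>

/* Made-up accessor: program the per-TC frame length register. */
static void foo_write_max_frame_len(struct net_device *dev, int tc, u32 max_len)
{
}

/* Sketch: translate per-TC queueMaxSDU (L2 payload octets) into the
 * frame-length limit a device register typically expects.
 */
static int foo_apply_taprio(struct net_device *dev,
			    struct tc_taprio_qopt_offload *offload)
{
	int tc;

	for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
		if (!offload->max_sdu[tc])
			continue;	/* 0 means no limit requested */
		/* SDU excludes L2 overhead; leave room for one VLAN tag */
		foo_write_max_frame_len(dev, tc,
					offload->max_sdu[tc] + VLAN_ETH_HLEN);
	}
	return 0;
}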
To keep the fast path checks optimized, we keep 2 arrays in the qdisc, one for max_sdu translated into frame length (so that it's comparable to skb->len), and another for offloading and for dumping back to the user. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 5 ++ include/uapi/linux/pkt_sched.h | 11 +++ net/sched/sch_taprio.c | 152 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 167 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 34600292fdfb..38207873eda6 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -160,6 +160,10 @@ struct tc_etf_qopt_offload { s32 queue; }; +struct tc_taprio_caps { + bool supports_queue_max_sdu:1; +}; + struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ @@ -173,6 +177,7 @@ struct tc_taprio_qopt_offload { ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; + u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f292b467b27f..000eec106856 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1232,6 +1232,16 @@ enum { #define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST _BITUL(0) #define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD _BITUL(1) +enum { + TCA_TAPRIO_TC_ENTRY_UNSPEC, + TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */ + + /* add new constants above here */ + __TCA_TAPRIO_TC_ENTRY_CNT, + TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1) +}; + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1245,6 +1255,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ TCA_TAPRIO_ATTR_FLAGS, /* u32 */ TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */ + TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 0bc6d90e1e51..435d866fcfa0 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -78,6 +78,8 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; + u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ + u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */ u32 txtime_delay; }; @@ -415,6 +417,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int prio = skb->priority; + u8 tc; /* sk_flags are only safe to use on full sockets. 
*/ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { @@ -426,6 +431,11 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } + /* Devices with full offload are expected to honor this in hardware */ + tc = netdev_get_prio_tc_map(dev, prio); + if (skb->len > q->max_frm_len[tc]) + return qdisc_drop(skb, sch, to_free); + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -754,6 +764,11 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; +static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { + [TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, +}; + static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) @@ -766,6 +781,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, + [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, @@ -1216,7 +1232,8 @@ static int taprio_enable_offload(struct net_device *dev, { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; - int err = 0; + struct tc_taprio_caps caps; + int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, @@ -1224,6 +1241,19 @@ static int taprio_enable_offload(struct net_device *dev, return -EOPNOTSUPP; } + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, + &caps, sizeof(caps)); + + if (!caps.supports_queue_max_sdu) { + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + if (q->max_sdu[tc]) { + NL_SET_ERR_MSG_MOD(extack, + "Device does not handle queueMaxSDU"); + return -EOPNOTSUPP; + } + } + } + offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, @@ -1233,6 +1263,9 @@ static int taprio_enable_offload(struct net_device *dev, offload->enable = 1; taprio_sched_to_offload(dev, sched, offload); + for (tc = 0; tc < TC_MAX_QUEUE; tc++) + offload->max_sdu[tc] = q->max_sdu[tc]; + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, @@ -1367,6 +1400,89 @@ out: return err; } +static int taprio_parse_tc_entry(struct Qdisc *sch, + struct nlattr *opt, + u32 max_sdu[TC_QOPT_MAX_QUEUE], + unsigned long *seen_tcs, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; + struct net_device *dev = qdisc_dev(sch); + u32 val = 0; + int err, tc; + + err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, + taprio_tc_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { + NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); + return -EINVAL; + } + + tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); + if (tc >= TC_QOPT_MAX_QUEUE) { + NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); + return -ERANGE; + } + + if (*seen_tcs & BIT(tc)) { + NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); + return -EINVAL; + } + + *seen_tcs |= BIT(tc); + + if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) + val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); + + if (val > dev->max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); + return -ERANGE; + } + + 
max_sdu[tc] = val; + + return 0; +} + +static int taprio_parse_tc_entries(struct Qdisc *sch, + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + u32 max_sdu[TC_QOPT_MAX_QUEUE]; + unsigned long seen_tcs = 0; + struct nlattr *n; + int tc, rem; + int err = 0; + + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) + max_sdu[tc] = q->max_sdu[tc]; + + nla_for_each_nested(n, opt, rem) { + if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) + continue; + + err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack); + if (err) + goto out; + } + + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { + q->max_sdu[tc] = max_sdu[tc]; + if (max_sdu[tc]) + q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len; + else + q->max_frm_len[tc] = U32_MAX; /* never oversized */ + } + +out: + return err; +} + static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { @@ -1445,6 +1561,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + err = taprio_parse_tc_entries(sch, opt, extack); + if (err) + return err; + new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); @@ -1825,6 +1945,33 @@ error_nest: return -1; } +static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb) +{ + struct nlattr *n; + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); + if (!n) + return -EMSGSIZE; + + if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, + q->max_sdu[tc])) + goto nla_put_failure; + + nla_nest_end(skb, n); + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, n); + return -EMSGSIZE; +} + static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); @@ -1863,6 +2010,9 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; + if (taprio_dump_tc_entries(q, skb)) + goto options_error; + if (oper && dump_schedule(skb, oper)) goto options_error; -- cgit v1.2.3 From 650ae67bbf7ba5ac193f053969612fbb93247b64 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:38 -0400 Subject: counter: Introduce the Signal polarity component The Signal polarity component represents the active level of a respective Signal. There are two possible states: positive (rising edge) and negative (falling edge); enum counter_signal_polarity represents these states. A convenience macro COUNTER_COMP_POLARITY() is provided for driver authors to declare a Signal polarity component. 
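For example, a driver exposing a selectable input polarity might declare it as in the following sketch; the foo_* names are hypothetical, the callback signatures follow the counter_comp signal_u32 hooks, and the DEFINE_COUNTER_AVAILABLE() helper from counter.h is assumed:

#include <linux/counter.h>

static const enum counter_signal_polarity foo_polarities[] = {
	COUNTER_SIGNAL_POLARITY_POSITIVE,
	COUNTER_SIGNAL_POLARITY_NEGATIVE,
};

static DEFINE_COUNTER_AVAILABLE(foo_polarity_available, foo_polarities);

static int foo_polarity_read(struct counter_device *counter,
			     struct counter_signal *signal, u32 *polarity)
{
	/* query the hardware here */
	*polarity = COUNTER_SIGNAL_POLARITY_POSITIVE;
	return 0;
}

static int foo_polarity_write(struct counter_device *counter,
			      struct counter_signal *signal, u32 polarity)
{
	/* program the hardware here */
	return 0;
}

static struct counter_comp foo_signal_ext[] = {
	COUNTER_COMP_POLARITY(foo_polarity_read, foo_polarity_write,
			      foo_polarity_available),
};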
Cc: Julien Panis Link: https://lore.kernel.org/r/8f47d6e1db71a11bb1e2666f8e2a6e9d256d4131.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/b6e53438badcb6318997d13dd2fc052f97d808ac.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-counter | 14 ++++++++++++++ drivers/counter/counter-chrdev.c | 1 + drivers/counter/counter-sysfs.c | 12 ++++++++++++ include/linux/counter.h | 10 ++++++++++ include/uapi/linux/counter.h | 6 ++++++ 5 files changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-counter b/Documentation/ABI/testing/sysfs-bus-counter index 06c2b3e27e0b..2a996deabe9e 100644 --- a/Documentation/ABI/testing/sysfs-bus-counter +++ b/Documentation/ABI/testing/sysfs-bus-counter @@ -217,6 +217,7 @@ What: /sys/bus/counter/devices/counterX/signalY/cable_fault_component_id What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable_component_id What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler_component_id What: /sys/bus/counter/devices/counterX/signalY/index_polarity_component_id +What: /sys/bus/counter/devices/counterX/signalY/polarity_component_id What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_component_id KernelVersion: 5.16 Contact: linux-iio@vger.kernel.org @@ -303,6 +304,19 @@ Description: Discrete set of available values for the respective Signal Y configuration are listed in this file. +What: /sys/bus/counter/devices/counterX/signalY/polarity +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Active level of Signal Y. The following polarity values are + available: + + positive: + Signal high state considered active level (rising edge). + + negative: + Signal low state considered active level (falling edge). 
+ What: /sys/bus/counter/devices/counterX/signalY/name KernelVersion: 5.2 Contact: linux-iio@vger.kernel.org diff --git a/drivers/counter/counter-chrdev.c b/drivers/counter/counter-chrdev.c index 4e71a19d7e6a..120879ee2e87 100644 --- a/drivers/counter/counter-chrdev.c +++ b/drivers/counter/counter-chrdev.c @@ -487,6 +487,7 @@ static int counter_get_data(struct counter_device *const counter, case COUNTER_COMP_ENUM: case COUNTER_COMP_COUNT_DIRECTION: case COUNTER_COMP_COUNT_MODE: + case COUNTER_COMP_SIGNAL_POLARITY: switch (comp_node->component.scope) { case COUNTER_SCOPE_DEVICE: ret = comp->device_u32_read(counter, &value_u32); diff --git a/drivers/counter/counter-sysfs.c b/drivers/counter/counter-sysfs.c index 04eac41dad33..e5dd36e1a45f 100644 --- a/drivers/counter/counter-sysfs.c +++ b/drivers/counter/counter-sysfs.c @@ -91,6 +91,11 @@ static const char *const counter_count_mode_str[] = { [COUNTER_COUNT_MODE_MODULO_N] = "modulo-n" }; +static const char *const counter_signal_polarity_str[] = { + [COUNTER_SIGNAL_POLARITY_POSITIVE] = "positive", + [COUNTER_SIGNAL_POLARITY_NEGATIVE] = "negative" +}; + static ssize_t counter_comp_u8_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -201,6 +206,8 @@ static ssize_t counter_comp_u32_show(struct device *dev, return sysfs_emit(buf, "%s\n", counter_count_direction_str[data]); case COUNTER_COMP_COUNT_MODE: return sysfs_emit(buf, "%s\n", counter_count_mode_str[data]); + case COUNTER_COMP_SIGNAL_POLARITY: + return sysfs_emit(buf, "%s\n", counter_signal_polarity_str[data]); default: return sysfs_emit(buf, "%u\n", (unsigned int)data); } @@ -252,6 +259,10 @@ static ssize_t counter_comp_u32_store(struct device *dev, err = counter_find_enum(&data, avail->enums, avail->num_items, buf, counter_count_mode_str); break; + case COUNTER_COMP_SIGNAL_POLARITY: + err = counter_find_enum(&data, avail->enums, avail->num_items, + buf, counter_signal_polarity_str); + break; default: err = kstrtou32(buf, 0, &data); break; @@ -469,6 +480,7 @@ static int counter_attr_create(struct device *const dev, case COUNTER_COMP_ENUM: case COUNTER_COMP_COUNT_DIRECTION: case COUNTER_COMP_COUNT_MODE: + case COUNTER_COMP_SIGNAL_POLARITY: if (comp->device_u32_read) { dev_attr->attr.mode |= 0444; dev_attr->show = counter_comp_u32_show; diff --git a/include/linux/counter.h b/include/linux/counter.h index a81234bc8ea8..b3fb6b68881a 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -31,6 +31,7 @@ enum counter_comp_type { COUNTER_COMP_ENUM, COUNTER_COMP_COUNT_DIRECTION, COUNTER_COMP_COUNT_MODE, + COUNTER_COMP_SIGNAL_POLARITY, }; /** @@ -477,6 +478,15 @@ struct counter_available { #define COUNTER_COMP_FLOOR(_read, _write) \ COUNTER_COMP_COUNT_U64("floor", _read, _write) +#define COUNTER_COMP_POLARITY(_read, _write, _available) \ +{ \ + .type = COUNTER_COMP_SIGNAL_POLARITY, \ + .name = "polarity", \ + .signal_u32_read = (_read), \ + .signal_u32_write = (_write), \ + .priv = &(_available), \ +} + #define COUNTER_COMP_PRESET(_read, _write) \ COUNTER_COMP_COUNT_U64("preset", _read, _write) diff --git a/include/uapi/linux/counter.h b/include/uapi/linux/counter.h index 96c5ffd368ad..e9610e1944dc 100644 --- a/include/uapi/linux/counter.h +++ b/include/uapi/linux/counter.h @@ -153,4 +153,10 @@ enum counter_synapse_action { COUNTER_SYNAPSE_ACTION_BOTH_EDGES, }; +/* Signal polarity values */ +enum counter_signal_polarity { + COUNTER_SIGNAL_POLARITY_POSITIVE, + COUNTER_SIGNAL_POLARITY_NEGATIVE, +}; + #endif /* _UAPI_COUNTER_H_ */ -- cgit v1.2.3 From 
45d2918520b2d8e640e4fb3fbf664dfb823dc520 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Tue, 27 Sep 2022 18:53:40 -0400 Subject: counter: Introduce the Count capture component Some devices provide a latch function to save historic Count values. This patch standardizes exposure of such functionality as Count capture components. A COUNTER_COMP_CAPTURE macro is provided for driver authors to define a capture component. A new event COUNTER_EVENT_CAPTURE is introduced to represent Count value capture events. Cc: Julien Panis Link: https://lore.kernel.org/r/c239572ab4208d0d6728136e82a88ad464369a7a.1664204990.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/3cebaa0b807a225eb277d771504fe6dba7269ffd.1664318353.git.william.gray@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-counter | 7 +++++++ include/linux/counter.h | 3 +++ include/uapi/linux/counter.h | 2 ++ 3 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-counter b/Documentation/ABI/testing/sysfs-bus-counter index 2a996deabe9e..3eb6b063970a 100644 --- a/Documentation/ABI/testing/sysfs-bus-counter +++ b/Documentation/ABI/testing/sysfs-bus-counter @@ -4,6 +4,12 @@ Contact: linux-iio@vger.kernel.org Description: Count data of Count Y represented as a string. +What: /sys/bus/counter/devices/counterX/countY/capture +KernelVersion: 6.1 +Contact: linux-iio@vger.kernel.org +Description: + Historical capture of the Count Y count data. + What: /sys/bus/counter/devices/counterX/countY/ceiling KernelVersion: 5.2 Contact: linux-iio@vger.kernel.org @@ -203,6 +209,7 @@ Description: both edges: Any state transition. +What: /sys/bus/counter/devices/counterX/countY/capture_component_id What: /sys/bus/counter/devices/counterX/countY/ceiling_component_id What: /sys/bus/counter/devices/counterX/countY/floor_component_id What: /sys/bus/counter/devices/counterX/countY/count_mode_component_id diff --git a/include/linux/counter.h b/include/linux/counter.h index b3fb6b68881a..e160197971dd 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -453,6 +453,9 @@ struct counter_available { .priv = &(_available), \ } +#define COUNTER_COMP_CAPTURE(_read, _write) \ + COUNTER_COMP_COUNT_U64("capture", _read, _write) + #define COUNTER_COMP_CEILING(_read, _write) \ COUNTER_COMP_COUNT_U64("ceiling", _read, _write) diff --git a/include/uapi/linux/counter.h b/include/uapi/linux/counter.h index e9610e1944dc..8ab12d731e3b 100644 --- a/include/uapi/linux/counter.h +++ b/include/uapi/linux/counter.h @@ -63,6 +63,8 @@ enum counter_event_type { COUNTER_EVENT_INDEX, /* State of counter is changed */ COUNTER_EVENT_CHANGE_OF_STATE, + /* Count value captured */ + COUNTER_EVENT_CAPTURE, }; /** -- cgit v1.2.3 From 9cda70f622cdcf049521a9c2886e5fd8a90a0591 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Fri, 30 Sep 2022 11:57:39 +0530 Subject: io_uring: introduce fixed buffer support for io_uring_cmd Add IORING_URING_CMD_FIXED flag that is to be used for sending an io_uring command with previously registered buffers. User-space passes the buffer index in sqe->buf_index, same as done in read/write variants that use fixed buffers.
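A user-space sketch of the flow (liburing is assumed for ring setup, FOO_CMD_READ is a made-up device-specific opcode, and large command payloads may additionally require IORING_SETUP_SQE128):

#include <string.h>
#include <liburing.h>

#define FOO_CMD_READ	0x42	/* made-up driver command opcode */

/* Sketch: issue a uring_cmd against registered buffer index 0. */
static int submit_fixed_cmd(struct io_uring *ring, int dev_fd,
			    void *buf, unsigned int buf_len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
	struct io_uring_sqe *sqe;

	if (io_uring_register_buffers(ring, &iov, 1))
		return -1;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -1;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = dev_fd;
	sqe->cmd_op = FOO_CMD_READ;
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
	sqe->buf_index = 0;	/* index into the registered buffers */

	return io_uring_submit(ring);
}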
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20220930062749.152261-3-anuj20.g@samsung.com [axboe: shuffle valid flags check before acting on it] Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 +- include/uapi/linux/io_uring.h | 9 +++++++++ io_uring/uring_cmd.c | 19 ++++++++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1dbf51115c30..e10c5cc81082 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -28,7 +28,7 @@ struct io_uring_cmd { void *cookie; }; u32 cmd_op; - u32 pad; + u32 flags; u8 pdu[32]; /* available inline for free use */ }; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92f29d9505a6..ab7458033ee3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -56,6 +56,7 @@ struct io_uring_sqe { __u32 hardlink_flags; __u32 xattr_flags; __u32 msg_ring_flags; + __u32 uring_cmd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -219,6 +220,14 @@ enum io_uring_op { IORING_OP_LAST, }; +/* + * sqe->uring_cmd_flags + * IORING_URING_CMD_FIXED use registered buffer; pass this flag + * along with setting sqe->buf_index. + */ +#define IORING_URING_CMD_FIXED (1U << 0) + + /* * sqe->fsync_flags */ diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 6a6d69523d75..e50de0b6b9f8 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -77,8 +78,24 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - if (sqe->rw_flags || sqe->__pad1) + if (sqe->__pad1) return -EINVAL; + + ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); + if (ioucmd->flags & ~IORING_URING_CMD_FIXED) + return -EINVAL; + + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + struct io_ring_ctx *ctx = req->ctx; + u16 index; + + req->buf_index = READ_ONCE(sqe->buf_index); + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + } ioucmd->cmd = sqe->cmd; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return 0; -- cgit v1.2.3 From 18ff0bcda6d1dd3d53b4ce3f03e61bf1a648f960 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:52:00 +0200 Subject: ethtool: add interface to interact with Ethernet Power Equipment Add an interface to support Power Sourcing Equipment. At the current step it provides a generic way to address all variants of PSE devices as defined in IEEE 802.3-2018, but supports only the objects specified for IEEE 802.3-2018 104.4 PoDL Power Sourcing Equipment (PSE). Currently supported and mandatory objects are: IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl This is the minimal interface needed to control PSE on each separate Ethernet port, but it does not provide all mandatory objects specified in IEEE 802.3-2018. Since "PoDL PSE" and "PSE" have similar names but somewhat different values, I decided not to merge them and kept separate naming schemes. This should allow us to stay as close to the IEEE 802.3 spec as possible and avoid name conflicts in the future.
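The driver-side contract behind this interface is two callbacks; a hypothetical controller driver might wire them up as in the following sketch (the foo_* names are made up, enum values follow the uapi definitions added below, and error handling is trimmed):

#include <linux/pse-pd/pse.h>

static int foo_ethtool_get_status(struct pse_controller_dev *pcdev,
				  unsigned long id,
				  struct netlink_ext_ack *extack,
				  struct pse_control_status *status)
{
	/* Report what the hardware is actually doing on channel "id". */
	status->podl_admin_state = ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED;
	status->podl_pw_status = ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING;
	return 0;
}

static int foo_ethtool_set_config(struct pse_controller_dev *pcdev,
				  unsigned long id,
				  struct netlink_ext_ack *extack,
				  const struct pse_control_config *config)
{
	/* Switch power sourcing on or off per config->admin_control. */
	return 0;
}

static const struct pse_controller_ops foo_pse_ops = {
	.ethtool_get_status = foo_ethtool_get_status,
	.ethtool_set_config = foo_ethtool_set_config,
};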
This implementation is connected to PHYs instead of MACs because PSE auto classification can potentially interfere with PHY auto negotiation. So maybe some extra PHY-related initialization will be needed. With a WIP version of ethtool, interaction with a PSE-capable link looks as follows: $ ip l ... 5: t1l1@eth0: .. ... $ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: disabled PoDL PSE Power Detection Status: disabled $ ethtool --set-pse t1l1 podl-pse-admin-control enable $ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: enabled PoDL PSE Power Detection Status: delivering power Signed-off-by: kernel test robot Signed-off-by: Oleksij Rempel Reviewed-by: Bagas Sanjaya Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 59 +++++++++ drivers/net/pse-pd/pse_core.c | 58 +++++++++ include/linux/pse-pd/pse.h | 62 +++++++++ include/uapi/linux/ethtool.h | 45 +++++++ include/uapi/linux/ethtool_netlink.h | 16 +++ net/ethtool/Makefile | 3 +- net/ethtool/common.h | 1 + net/ethtool/netlink.c | 17 +++ net/ethtool/netlink.h | 4 + net/ethtool/pse-pd.c | 185 +++++++++++++++++++++++++++ 10 files changed, 449 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/pse-pd.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 09fb1d5ba67f..d578b8bcd8a4 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -220,6 +220,8 @@ Userspace to kernel: ``ETHTOOL_MSG_PHC_VCLOCKS_GET`` get PHC virtual clocks info ``ETHTOOL_MSG_MODULE_SET`` set transceiver module parameters ``ETHTOOL_MSG_MODULE_GET`` get transceiver module parameters + ``ETHTOOL_MSG_PSE_SET`` set PSE parameters + ``ETHTOOL_MSG_PSE_GET`` get PSE parameters ===================================== ================================= Kernel to userspace: @@ -260,6 +262,7 @@ Kernel to userspace: ``ETHTOOL_MSG_STATS_GET_REPLY`` standard statistics ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY`` PHC virtual clocks info ``ETHTOOL_MSG_MODULE_GET_REPLY`` transceiver module parameters + ``ETHTOOL_MSG_PSE_GET_REPLY`` PSE parameters ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1627,6 +1630,62 @@ For SFF-8636 modules, low power mode is forced by the host according to table For CMIS modules, low power mode is forced by the host according to table 6-12 in revision 5.0 of the specification. +PSE_GET +======= + +Gets PSE attributes. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_PSE_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + ====================================== ====== ============================= + ``ETHTOOL_A_PSE_HEADER`` nested reply header + ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` u32 Operational state of the PoDL + PSE functions + ``ETHTOOL_A_PODL_PSE_PW_D_STATUS`` u32 power detection status of the + PoDL PSE. + ====================================== ====== ============================= + +When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` attribute identifies +the operational state of the PoDL PSE functions. The operational state of the +PSE function can be changed using the ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL`` +action.
This option is corresponding to ``IEEE 802.3-2018`` 30.15.1.1.2 +aPoDLPSEAdminState. Possible values are: + +.. kernel-doc:: include/uapi/linux/ethtool.h + :identifiers: ethtool_podl_pse_admin_state + +When set, the optional ``ETHTOOL_A_PODL_PSE_PW_D_STATUS`` attribute identifies +the power detection status of the PoDL PSE. The status depend on internal PSE +state machine and automatic PD classification support. This option is +corresponding to ``IEEE 802.3-2018`` 30.15.1.1.3 aPoDLPSEPowerDetectionStatus. +Possible values are: + +.. kernel-doc:: include/uapi/linux/ethtool.h + :identifiers: ethtool_podl_pse_pw_d_status + +PSE_SET +======= + +Sets PSE parameters. + +Request contents: + + ====================================== ====== ============================= + ``ETHTOOL_A_PSE_HEADER`` nested request header + ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL`` u32 Control PoDL PSE Admin state + ====================================== ====== ============================= + +When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL`` attribute is used +to control PoDL PSE Admin functions. This option is implementing +``IEEE 802.3-2018`` 30.15.1.2.1 acPoDLPSEAdminControl. See +``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` for supported values. + Request translation =================== diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index f431159fcc0b..146b81f08a89 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -254,3 +254,61 @@ out: return psec; } EXPORT_SYMBOL_GPL(of_pse_control_get); + +/** + * pse_ethtool_get_status - get status of PSE control + * @psec: PSE control pointer + * @extack: extack for reporting useful error messages + * @status: struct to store PSE status + */ +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) +{ + const struct pse_controller_ops *ops; + int err; + + ops = psec->pcdev->ops; + + if (!ops->ethtool_get_status) { + NL_SET_ERR_MSG(extack, + "PSE driver does not support status report"); + return -EOPNOTSUPP; + } + + mutex_lock(&psec->pcdev->lock); + err = ops->ethtool_get_status(psec->pcdev, psec->id, extack, status); + mutex_unlock(&psec->pcdev->lock); + + return err; +} +EXPORT_SYMBOL_GPL(pse_ethtool_get_status); + +/** + * pse_ethtool_set_config - set PSE control configuration + * @psec: PSE control pointer + * @extack: extack for reporting useful error messages + * @config: Configuration of the test to run + */ +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) +{ + const struct pse_controller_ops *ops; + int err; + + ops = psec->pcdev->ops; + + if (!ops->ethtool_set_config) { + NL_SET_ERR_MSG(extack, + "PSE driver does not configuration"); + return -EOPNOTSUPP; + } + + mutex_lock(&psec->pcdev->lock); + err = ops->ethtool_set_config(psec->pcdev, psec->id, extack, config); + mutex_unlock(&psec->pcdev->lock); + + return err; +} +EXPORT_SYMBOL_GPL(pse_ethtool_set_config); diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 3ba787a48b15..fd1a916eeeba 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -9,6 +9,47 @@ #include #include +struct phy_device; +struct pse_controller_dev; + +/** + * struct pse_control_config - PSE control/channel configuration. 
+ * + * @admin_cotrol: set PoDL PSE admin control as described in + * IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl + */ +struct pse_control_config { + enum ethtool_podl_pse_admin_state admin_cotrol; +}; + +/** + * struct pse_control_status - PSE control/channel status. + * + * @podl_admin_state: operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @podl_pw_status: power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + */ +struct pse_control_status { + enum ethtool_podl_pse_admin_state podl_admin_state; + enum ethtool_podl_pse_pw_d_status podl_pw_status; +}; + +/** + * struct pse_controller_ops - PSE controller driver callbacks + * + * @ethtool_get_status: get PSE control status for ethtool interface + * @ethtool_set_config: set PSE control configuration over ethtool interface + */ +struct pse_controller_ops { + int (*ethtool_get_status)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + struct pse_control_status *status); + int (*ethtool_set_config)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + const struct pse_control_config *config); +}; + struct module; struct device_node; struct of_phandle_args; @@ -51,6 +92,13 @@ int devm_pse_controller_register(struct device *dev, struct pse_control *of_pse_control_get(struct device_node *node); void pse_control_put(struct pse_control *psec); +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status); +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config); + #else static inline struct pse_control *of_pse_control_get(struct device_node *node) @@ -62,6 +110,20 @@ static inline void pse_control_put(struct pse_control *psec) { } +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) +{ + return -ENOTSUPP; +} + +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) +{ + return -ENOTSUPP; +} + #endif #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index fe9893d1485d..dc2aa3d75b39 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -736,6 +736,51 @@ enum ethtool_module_power_mode { ETHTOOL_MODULE_POWER_MODE_HIGH, }; +/** + * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are + * unknown + * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled + * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled + */ +enum ethtool_podl_pse_admin_state { + ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, +}; + +/** + * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. 
+ * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is + * asserted true when the PoDL PSE state diagram variable mr_pse_enable is + * false" + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is + * asserted true when either of the PSE state diagram variables + * pi_detecting or pi_classifying is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” + * is asserted true when the PoDL PSE state diagram variable pi_powered is + * true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted + * true when the PoDL PSE state diagram variable pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true + * when the logical combination of the PoDL PSE state diagram variables + * pi_prebiased*!pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted + * true when the PoDL PSE state diagram variable overload_held is true." + */ +enum ethtool_podl_pse_pw_d_status { + ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, + ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, + ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, + ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, + ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, + ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, + ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, +}; + /** * struct ethtool_gstrings - string set for data tagging * @cmd: Command number = %ETHTOOL_GSTRINGS diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 408a664fad59..bb57084ac524 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -49,6 +49,8 @@ enum { ETHTOOL_MSG_PHC_VCLOCKS_GET, ETHTOOL_MSG_MODULE_GET, ETHTOOL_MSG_MODULE_SET, + ETHTOOL_MSG_PSE_GET, + ETHTOOL_MSG_PSE_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -94,6 +96,7 @@ enum { ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, ETHTOOL_MSG_MODULE_GET_REPLY, ETHTOOL_MSG_MODULE_NTF, + ETHTOOL_MSG_PSE_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -863,6 +866,19 @@ enum { ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) }; +/* Power Sourcing Equipment */ +enum { + ETHTOOL_A_PSE_UNSPEC, + ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ + ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PSE_CNT, + ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index b76432e70e6b..72ab0944262a 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ + pse-pd.o diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 2dc2b80aea5f..c1779657e074 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -46,6 +46,7 @@ int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); int __ethtool_get_ts_info(struct net_device *dev, struct 
ethtool_ts_info *info); extern const struct ethtool_phy_ops *ethtool_phy_ops; +extern const struct ethtool_pse_ops *ethtool_pse_ops; int ethtool_get_module_info_call(struct net_device *dev, struct ethtool_modinfo *modinfo); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index f4e41a6e0163..1a4c11356c96 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -286,6 +286,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, + [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1023,6 +1024,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PSE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_pse_get_policy, + .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PSE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_pse, + .policy = ethnl_pse_set_policy, + .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index c0d587611854..1bfd374f9718 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; +extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -383,6 +384,8 @@ extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1 extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; +extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; +extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -402,6 +405,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c new file mode 100644 index 000000000000..5a471e115b66 --- /dev/null +++ b/net/ethtool/pse-pd.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// ethtool interface for for Ethernet PSE (Power Sourcing Equipment) +// and 
PD (Powered Device) +// +// Copyright (c) 2022 Pengutronix, Oleksij Rempel +// + +#include "common.h" +#include "linux/pse-pd/pse.h" +#include "netlink.h" +#include +#include +#include + +struct pse_req_info { + struct ethnl_req_info base; +}; + +struct pse_reply_data { + struct ethnl_reply_data base; + struct pse_control_status status; +}; + +#define PSE_REPDATA(__reply_base) \ + container_of(__reply_base, struct pse_reply_data, base) + +/* PSE_GET */ + +const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = { + [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int pse_get_pse_attributes(struct net_device *dev, + struct netlink_ext_ack *extack, + struct pse_reply_data *data) +{ + struct phy_device *phydev = dev->phydev; + + if (!phydev) { + NL_SET_ERR_MSG(extack, "No PHY is attached"); + return -EOPNOTSUPP; + } + + if (!phydev->psec) { + NL_SET_ERR_MSG(extack, "No PSE is attached"); + return -EOPNOTSUPP; + } + + memset(&data->status, 0, sizeof(data->status)); + + return pse_ethtool_get_status(phydev->psec, extack, &data->status); +} + +static int pse_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct pse_reply_data *data = PSE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = pse_get_pse_attributes(dev, info->extack, data); + + ethnl_ops_complete(dev); + + return ret; +} + +static int pse_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct pse_reply_data *data = PSE_REPDATA(reply_base); + const struct pse_control_status *st = &data->status; + int len = 0; + + if (st->podl_admin_state > 0) + len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ + if (st->podl_pw_status > 0) + len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */ + + return len; +} + +static int pse_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct pse_reply_data *data = PSE_REPDATA(reply_base); + const struct pse_control_status *st = &data->status; + + if (st->podl_admin_state > 0 && + nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, + st->podl_admin_state)) + return -EMSGSIZE; + + if (st->podl_pw_status > 0 && + nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS, + st->podl_pw_status)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_pse_request_ops = { + .request_cmd = ETHTOOL_MSG_PSE_GET, + .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PSE_HEADER, + .req_info_size = sizeof(struct pse_req_info), + .reply_data_size = sizeof(struct pse_reply_data), + + .prepare_data = pse_prepare_data, + .reply_size = pse_reply_size, + .fill_reply = pse_fill_reply, +}; + +/* PSE_SET */ + +const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { + [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = + NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), +}; + +static int pse_set_pse_config(struct net_device *dev, + struct netlink_ext_ack *extack, + struct nlattr **tb) +{ + struct phy_device *phydev = dev->phydev; + struct pse_control_config config = {}; + + /* Optional attribute. Do not return error if not set. 
*/ + if (!tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]) + return 0; + + /* this values are already validated by the ethnl_pse_set_policy */ + config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); + + if (!phydev) { + NL_SET_ERR_MSG(extack, "No PHY is attached"); + return -EOPNOTSUPP; + } + + if (!phydev->psec) { + NL_SET_ERR_MSG(extack, "No PSE is attached"); + return -EOPNOTSUPP; + } + + return pse_ethtool_set_config(phydev->psec, extack, &config); +} + +int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct net_device *dev; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_PSE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = pse_set_pse_config(dev, info->extack, tb); + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + + ethnl_parse_header_dev_put(&req_info); + + return ret; +} -- cgit v1.2.3 From 90fea5a800c3dd80fb8ad9a02929bcef5fde42b8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 27 Sep 2022 15:48:08 +0800 Subject: vdpa: device feature provisioning This patch allows the device features to be provisioned through netlink. A new attribute is introduced to allow userspace to pass a 64-bit device feature set when adding a device. This provides several advantages: - Allows provisioning a subset of the features to ease cross-vendor live migration. - Better debuggability for the vDPA framework and parent. Reviewed-by: Eli Cohen Signed-off-by: Jason Wang Message-Id: <20220927074810.28627-2-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 5 +++++ include/linux/vdpa.h | 1 + include/uapi/linux/vdpa.h | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index c06c02704461..278e26bfa492 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -600,6 +600,11 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i } config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } + if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { + config.device_features = + nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); + config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); + } /* Skip checking capability if user didn't prefer to configure any * device networking attributes. It is likely that user might have used diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index d282f464d2f1..6d0f5e4e82c2 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -104,6 +104,7 @@ struct vdpa_iova_range { }; struct vdpa_dev_set_config { + u64 device_features; struct { u8 mac[ETH_ALEN]; u16 mtu; diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 25c55cab3d7c..9dc855f37c59 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -52,6 +52,8 @@ enum vdpa_attr { VDPA_ATTR_DEV_VENDOR_ATTR_NAME, /* string */ VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, /* u64 */ + VDPA_ATTR_DEV_FEATURES, /* u64 */ + /* new attributes must be added above here */ VDPA_ATTR_MAX, }; -- cgit v1.2.3 From e60d64074214db7207fc13c25ee39d8d47cb4a34 Mon Sep 17 00:00:00 2001 From: Alvaro Karsz Date: Wed, 21 Sep 2022 11:27:29 +0300 Subject: virtio_blk: add SECURE ERASE command support Add support for the VIRTIO_BLK_F_SECURE_ERASE VirtIO feature. A device that offers this feature can receive VIRTIO_BLK_T_SECURE_ERASE commands.
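Secure erase reuses the discard/write-zeroes segment descriptor already defined in the UAPI header, so a single-segment request body could be built as in this minimal sketch (illustrative values, not taken from the patch):

#include <linux/virtio_blk.h>

/* Fill one secure-erase segment; offsets and lengths are in 512-byte sectors. */
static void fill_secure_erase_segment(struct virtio_blk_discard_write_zeroes *seg)
{
	seg->sector = 2048;   /* starting sector, must honor the alignment field */
	seg->num_sectors = 8; /* capped by max_secure_erase_sectors */
	seg->flags = 0;       /* the unmap flag is only meaningful for write-zeroes */
}

The driver change below builds exactly this payload for VIRTIO_BLK_T_SECURE_ERASE requests by reusing virtblk_setup_discard_write_zeroes_erase().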
A device which supports this feature has the following fields in the virtio config: - max_secure_erase_sectors - max_secure_erase_seg - secure_erase_sector_alignment max_secure_erase_sectors and secure_erase_sector_alignment are expressed in 512-byte units. Every secure erase command has the following fields: - sectors: The starting offset in 512-byte units. - num_sectors: The number of sectors. Signed-off-by: Alvaro Karsz Message-Id: <20220921082729.2516779-1-alvaro.karsz@solid-run.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi --- drivers/block/virtio_blk.c | 110 +++++++++++++++++++++++++++++++++------- include/uapi/linux/virtio_blk.h | 19 +++++++ 2 files changed, 111 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dd9a05174726..f03505a65d08 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -130,7 +130,7 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap) +static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool unmap) { unsigned short segments = blk_rq_nr_discard_segments(req); unsigned short n = 0; @@ -240,6 +240,9 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, type = VIRTIO_BLK_T_WRITE_ZEROES; unmap = !(req->cmd_flags & REQ_NOUNMAP); break; + case REQ_OP_SECURE_ERASE: + type = VIRTIO_BLK_T_SECURE_ERASE; + break; case REQ_OP_DRV_IN: type = VIRTIO_BLK_T_GET_ID; break; @@ -251,8 +254,9 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, vbr->out_hdr.type = cpu_to_virtio32(vdev, type); vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); - if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { - if (virtblk_setup_discard_write_zeroes(req, unmap)) + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES || + type == VIRTIO_BLK_T_SECURE_ERASE) { + if (virtblk_setup_discard_write_zeroes_erase(req, unmap)) return BLK_STS_RESOURCE; } @@ -888,6 +892,8 @@ static int virtblk_probe(struct virtio_device *vdev) int err, index; u32 v, blk_size, max_size, sg_elems, opt_io_size; + u32 max_discard_segs = 0; + u32 discard_granularity = 0; u16 min_io_size; u8 physical_block_exp, alignment_offset; unsigned int queue_depth; @@ -1045,27 +1051,14 @@ static int virtblk_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { virtio_cread(vdev, struct virtio_blk_config, - discard_sector_alignment, &v); - if (v) - q->limits.discard_granularity = v << SECTOR_SHIFT; - else - q->limits.discard_granularity = blk_size; + discard_sector_alignment, &discard_granularity); virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); blk_queue_max_discard_sectors(q, v ? v : UINT_MAX); virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, - &v); - - /* - * max_discard_seg == 0 is out of spec but we always - * handled it. - */ - if (!v) - v = sg_elems; - blk_queue_max_discard_segments(q, - min(v, MAX_DISCARD_SEGMENTS)); + &max_discard_segs); } if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { @@ -1074,6 +1067,85 @@ static int virtblk_probe(struct virtio_device *vdev) blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX); } + /* The discard and secure erase limits are combined since the Linux + * block layer uses the same limit for both commands. 
+ * + * If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features + * are negotiated, we will use the minimum between the limits. + * + * discard sector alignment is set to the minimum between discard_sector_alignment + * and secure_erase_sector_alignment. + * + * max discard sectors is set to the minimum between max_discard_seg and + * max_secure_erase_seg. + */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { + + virtio_cread(vdev, struct virtio_blk_config, + secure_erase_sector_alignment, &v); + + /* secure_erase_sector_alignment should not be zero, the device should set a + * valid number of sectors. + */ + if (!v) { + dev_err(&vdev->dev, + "virtio_blk: secure_erase_sector_alignment can't be 0\n"); + err = -EINVAL; + goto out_cleanup_disk; + } + + discard_granularity = min_not_zero(discard_granularity, v); + + virtio_cread(vdev, struct virtio_blk_config, + max_secure_erase_sectors, &v); + + /* max_secure_erase_sectors should not be zero, the device should set a + * valid number of sectors. + */ + if (!v) { + dev_err(&vdev->dev, + "virtio_blk: max_secure_erase_sectors can't be 0\n"); + err = -EINVAL; + goto out_cleanup_disk; + } + + blk_queue_max_secure_erase_sectors(q, v); + + virtio_cread(vdev, struct virtio_blk_config, + max_secure_erase_seg, &v); + + /* max_secure_erase_seg should not be zero, the device should set a + * valid number of segments + */ + if (!v) { + dev_err(&vdev->dev, + "virtio_blk: max_secure_erase_seg can't be 0\n"); + err = -EINVAL; + goto out_cleanup_disk; + } + + max_discard_segs = min_not_zero(max_discard_segs, v); + } + + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) || + virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) { + /* max_discard_seg and discard_granularity will be 0 only + * if max_discard_seg and discard_sector_alignment fields in the virtio + * config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated. + * In this case, we use default values. 
+ */ + if (!max_discard_segs) + max_discard_segs = sg_elems; + + blk_queue_max_discard_segments(q, + min(max_discard_segs, MAX_DISCARD_SEGMENTS)); + + if (discard_granularity) + q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT; + else + q->limits.discard_granularity = blk_size; + } + virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); @@ -1169,6 +1241,7 @@ static unsigned int features_legacy[] = { VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, + VIRTIO_BLK_F_SECURE_ERASE, } ; static unsigned int features[] = { @@ -1176,6 +1249,7 @@ static unsigned int features[] = { VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, + VIRTIO_BLK_F_SECURE_ERASE, }; static struct virtio_driver virtio_blk = { diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index d888f013d9ff..58e70b24b504 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -40,6 +40,7 @@ #define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ #define VIRTIO_BLK_F_DISCARD 13 /* DISCARD is supported */ #define VIRTIO_BLK_F_WRITE_ZEROES 14 /* WRITE ZEROES is supported */ +#define VIRTIO_BLK_F_SECURE_ERASE 16 /* Secure Erase is supported */ /* Legacy feature bits */ #ifndef VIRTIO_BLK_NO_LEGACY @@ -121,6 +122,21 @@ struct virtio_blk_config { __u8 write_zeroes_may_unmap; __u8 unused1[3]; + + /* the next 3 entries are guarded by VIRTIO_BLK_F_SECURE_ERASE */ + /* + * The maximum secure erase sectors (in 512-byte sectors) for + * one segment. + */ + __virtio32 max_secure_erase_sectors; + /* + * The maximum number of secure erase segments in a + * secure erase command. + */ + __virtio32 max_secure_erase_seg; + /* Secure erase commands must be aligned to this number of sectors. */ + __virtio32 secure_erase_sector_alignment; + } __attribute__((packed)); /* @@ -155,6 +171,9 @@ struct virtio_blk_config { /* Write zeroes command */ #define VIRTIO_BLK_T_WRITE_ZEROES 13 +/* Secure erase command */ +#define VIRTIO_BLK_T_SECURE_ERASE 14 + #ifndef VIRTIO_BLK_NO_LEGACY /* Barrier before this op. */ #define VIRTIO_BLK_T_BARRIER 0x80000000 -- cgit v1.2.3 From 228565100def593df0f26ee07d5fb810039454d5 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Thu, 29 Sep 2022 09:45:50 +0800 Subject: vDPA: allow userspace to query features of a vDPA device This commit adds a new vDPA netlink attribute, VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES. Userspace can query features of vDPA devices through this new attr. This commit invokes vdpa_config_ops.get_config() rather than vdpa_get_config_unlocked() to read the device config space, so there are no races with vdpa_set_features_unlocked(). Userspace tool iproute2 example: $ vdpa dev config show vdpa0 vdpa0: mac 00:e8:ca:11:be:05 link up link_announce false max_vq_pairs 4 mtu 1500 negotiated_features MRG_RXBUF CTRL_VQ MQ VERSION_1 ACCESS_PLATFORM dev_features MTU MAC MRG_RXBUF CTRL_VQ MQ ANY_LAYOUT VERSION_1 ACCESS_PLATFORM Signed-off-by: Zhu Lingshan Message-Id: <20220929014555.112323-2-lingshan.zhu@intel.com> Signed-off-by: Michael S.
Tsirkin --- drivers/vdpa/vdpa.c | 16 +++++++++++----- include/uapi/linux/vdpa.h | 4 ++++ 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 278e26bfa492..d0a7b76d9163 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -820,10 +820,10 @@ static int vdpa_dev_net_mq_config_fill(struct vdpa_device *vdev, static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; - u64 features; + u64 features_device, features_driver; u16 val_u16; - vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); + vdev->config->get_config(vdev, 0, &config, sizeof(config)); if (nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config.mac), config.mac)) @@ -837,12 +837,18 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *ms if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16)) return -EMSGSIZE; - features = vdev->config->get_driver_features(vdev); - if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, + features_driver = vdev->config->get_driver_features(vdev); + if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, + VDPA_ATTR_PAD)) + return -EMSGSIZE; + + features_device = vdev->config->get_device_features(vdev); + + if (nla_put_u64_64bit(msg, VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; - return vdpa_dev_net_mq_config_fill(vdev, msg, features, &config); + return vdpa_dev_net_mq_config_fill(vdev, msg, features_driver, &config); } static int diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h index 9dc855f37c59..9bd79235c875 100644 --- a/include/uapi/linux/vdpa.h +++ b/include/uapi/linux/vdpa.h @@ -46,6 +46,7 @@ enum vdpa_attr { VDPA_ATTR_DEV_NEGOTIATED_FEATURES, /* u64 */ VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, /* u32 */ + /* virtio features that are supported by the vDPA management device */ VDPA_ATTR_DEV_SUPPORTED_FEATURES, /* u64 */ VDPA_ATTR_DEV_QUEUE_INDEX, /* u32 */ @@ -54,6 +55,9 @@ enum vdpa_attr { VDPA_ATTR_DEV_FEATURES, /* u64 */ + /* virtio features that are supported by the vDPA device */ + VDPA_ATTR_VDPA_DEV_SUPPORTED_FEATURES, /* u64 */ + /* new attributes must be added above here */ VDPA_ATTR_MAX, }; -- cgit v1.2.3 From 0ec8ce07394442d722806fe61b901a5b2b17249d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 14 Oct 2022 15:25:41 -0700 Subject: dmaengine: idxd: Do not enable user type Work Queue without Shared Virtual Addressing When the idxd_user_drv driver is bound to a Work Queue (WQ) device without IOMMU or with IOMMU Passthrough without Shared Virtual Addressing (SVA), the application gains direct access to physical memory via the device by programming a physical address into a submitted descriptor. This allows direct userspace read and write access to arbitrary physical memory. This is inconsistent with the security goals of a good kernel API. Unlike the vfio_pci driver, the IDXD char device driver does not provide any way to pin user pages and translate the address from user VA to IOVA or PA without IOMMU SVA. Therefore the application has no way to instruct the device to perform DMA. This makes the char device not usable for normal application usage. Since a user type WQ without SVA cannot be used for normal application usage and presents a security issue, bind the idxd_user_drv driver and enable user type WQs only when SVA is enabled (i.e. user PASID is enabled).
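Userspace can distinguish this new soft error from other enable failures by decoding the cmd_status value the driver records (exposed through sysfs); a minimal sketch, assuming the status word has already been read into an integer:

#include <linux/idxd.h>
#include <stdio.h>

/* Decode an idxd command status word against the UAPI soft-error codes. */
static void report_wq_enable_status(unsigned int cmd_status)
{
	if (!(cmd_status & IDXD_SCMD_SOFTERR_MASK))
		return; /* hardware status, not a driver soft error */

	if (cmd_status == IDXD_SCMD_WQ_USER_NO_IOMMU)
		fprintf(stderr, "user WQ rejected: IOMMU SVA/PASID not enabled\n");
	else
		fprintf(stderr, "WQ enable soft error: 0x%x\n", cmd_status);
}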
Fixes: 448c3de8ac83 ("dmaengine: idxd: create user driver for wq 'device'") Cc: stable@vger.kernel.org Suggested-by: Arjan Van De Ven Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20221014222541.3912195-1-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 18 ++++++++++++++++++ include/uapi/linux/idxd.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index c2808fd081d6..a9b96b18772f 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -312,6 +312,24 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) if (idxd->state != IDXD_DEV_ENABLED) return -ENXIO; + /* + * User type WQ is enabled only when SVA is enabled for two reasons: + * - If no IOMMU or IOMMU Passthrough without SVA, userspace + * can directly access physical address through the WQ. + * - The IDXD cdev driver does not provide any ways to pin + * user pages and translate the address from user VA to IOVA or + * PA without IOMMU SVA. Therefore the application has no way + * to instruct the device to perform DMA function. This makes + * the cdev not usable for normal application usage. + */ + if (!device_user_pasid_enabled(idxd)) { + idxd->cmd_status = IDXD_SCMD_WQ_USER_NO_IOMMU; + dev_dbg(&idxd->pdev->dev, + "User type WQ cannot be enabled without SVA.\n"); + + return -EOPNOTSUPP; + } + mutex_lock(&wq->wq_lock); wq->type = IDXD_WQT_USER; rc = drv_enable_wq(wq); diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 095299c75828..2b9e7feba3f3 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -29,6 +29,7 @@ enum idxd_scmd_stat { IDXD_SCMD_WQ_NO_SIZE = 0x800e0000, IDXD_SCMD_WQ_NO_PRIV = 0x800f0000, IDXD_SCMD_WQ_IRQ_ERR = 0x80100000, + IDXD_SCMD_WQ_USER_NO_IOMMU = 0x80110000, }; #define IDXD_SCMD_SOFTERR_MASK 0x80000000 -- cgit v1.2.3 From 8da7f0976b9071b528c545008de9d10cc81883b1 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 12 Oct 2022 16:46:17 +0100 Subject: media: videodev2.h: V4L2_DV_BT_BLANKING_HEIGHT should check 'interlaced' If it is a progressive (non-interlaced) format, then ignore the interlaced timing values. Signed-off-by: Hans Verkuil Fixes: 7f68127fa11f ([media] videodev2.h: defines to calculate blanking and frame sizes) Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 86cae23cc446..29da1f4b4578 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1601,7 +1601,8 @@ struct v4l2_bt_timings { ((bt)->width + V4L2_DV_BT_BLANKING_WIDTH(bt)) #define V4L2_DV_BT_BLANKING_HEIGHT(bt) \ ((bt)->vfrontporch + (bt)->vsync + (bt)->vbackporch + \ - (bt)->il_vfrontporch + (bt)->il_vsync + (bt)->il_vbackporch) + ((bt)->interlaced ? \ + ((bt)->il_vfrontporch + (bt)->il_vsync + (bt)->il_vbackporch) : 0)) #define V4L2_DV_BT_FRAME_HEIGHT(bt) \ ((bt)->height + V4L2_DV_BT_BLANKING_HEIGHT(bt)) -- cgit v1.2.3 From cb6c18b5a41622c7a439508f7421f8766a91cb87 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 1 Oct 2022 11:37:05 +0530 Subject: perf/mem: Rename PERF_MEM_LVLNUM_EXTN_MEM to PERF_MEM_LVLNUM_CXL PERF_MEM_LVLNUM_EXTN_MEM was introduced to cover CXL devices, but its name is a bit ambiguous and not generic enough to cover cxl.cache and cxl.io devices.
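For context on how this encoding is consumed: profiling tools classify a sample's data source by reading the mem_lvl_num bits of perf_mem_data_src. A minimal sketch, assuming a uapi header that already carries the PERF_MEM_LVLNUM_CXL name this patch introduces:

#include <linux/perf_event.h>
#include <stdbool.h>

/* Return true if a sampled access was satisfied from CXL-attached memory. */
static bool sample_hit_cxl(union perf_mem_data_src dsrc)
{
	return dsrc.mem_lvl_num == PERF_MEM_LVLNUM_CXL;
}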
Rename it to PERF_MEM_LVLNUM_CXL to be more specific. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/f6268268-b4e9-9ed6-0453-65792644d953@amd.com --- arch/x86/events/amd/ibs.c | 2 +- include/uapi/linux/perf_event.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 3271735f0070..4cb710efbdd9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -801,7 +801,7 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, /* Extension Memory */ if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { - data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL; if (op_data2->rmt_node) { data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; /* IBS doesn't provide Remote socket detail */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 85be78e0e7f6..ccb7f5dad59b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1337,7 +1337,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ /* 5-0x8 available */ -#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ +#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ -- cgit v1.2.3 From 986d93f55bdeab1cac858d1e47b41fac10b2d7f6 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 31 Oct 2022 10:10:21 +0800 Subject: audit: fix undefined behavior in bit shift for AUDIT_BIT Shifting signed 32-bit value by 31 bits is undefined, so changing significant bit to unsigned. The UBSAN warning calltrace like below: UBSAN: shift-out-of-bounds in kernel/auditfilter.c:179:23 left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: dump_stack_lvl+0x7d/0xa5 dump_stack+0x15/0x1b ubsan_epilogue+0xe/0x4e __ubsan_handle_shift_out_of_bounds+0x1e7/0x20c audit_register_class+0x9d/0x137 audit_classes_init+0x4d/0xb8 do_one_initcall+0x76/0x430 kernel_init_freeable+0x3b3/0x422 kernel_init+0x24/0x1e0 ret_from_fork+0x1f/0x30 Signed-off-by: Gaosheng Cui [PM: remove bad 'Fixes' tag as issue predates git, added in v2.6.6-rc1] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 7c1dc818b1d5..d676ed2b246e 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -187,7 +187,7 @@ #define AUDIT_MAX_KEY_LEN 256 #define AUDIT_BITMASK_SIZE 64 #define AUDIT_WORD(nr) ((__u32)((nr)/32)) -#define AUDIT_BIT(nr) (1 << ((nr) - AUDIT_WORD(nr)*32)) +#define AUDIT_BIT(nr) (1U << ((nr) - AUDIT_WORD(nr)*32)) #define AUDIT_SYSCALL_CLASSES 16 #define AUDIT_CLASS_DIR_WRITE 0 -- cgit v1.2.3 From aec1dc972d27c837d1406310dab5170189eb01e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 2 Nov 2022 11:25:16 -0700 Subject: net/ipv4: Fix linux/in.h header dependencies __DECLARE_FLEX_ARRAY is defined in include/uapi/linux/stddef.h but doesn't seem to be explicitly included from include/uapi/linux/in.h, which breaks BPF selftests builds (once we sync linux/stddef.h into tools/include directory in the next patch). Fix this by explicitly including linux/stddef.h. 
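A minimal reproducer for the breakage, assuming a userspace translation unit compiled against the exported headers: before this fix, including <linux/in.h> on its own could fail to build because struct ip_msfilter uses __DECLARE_FLEX_ARRAY(), which lives in <linux/stddef.h>; with the explicit include added, this compiles standalone:

#include <linux/in.h>

/* Touch the struct whose flexible-array member needs __DECLARE_FLEX_ARRAY(). */
static unsigned int msfilter_mode(const struct ip_msfilter *msf)
{
	return msf->imsf_fmode; /* MCAST_INCLUDE or MCAST_EXCLUDE */
}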
Given this affects BPF CI and bpf tree, targeting this for bpf tree. Fixes: 5854a09b4957 ("net/ipv4: Use __DECLARE_FLEX_ARRAY() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Cc: Gustavo A. R. Silva Cc: Jakub Kicinski Link: https://lore.kernel.org/bpf/20221102182517.2675301-1-andrii@kernel.org --- include/uapi/linux/in.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index f243ce665f74..07a4cb149305 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -20,6 +20,7 @@ #define _UAPI_LINUX_IN_H #include +#include #include #include -- cgit v1.2.3 From 46653972e3ea64f79e7f8ae3aa41a4d3fdb70a13 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 31 Oct 2022 19:25:36 +0800 Subject: capabilities: fix undefined behavior in bit shift for CAP_TO_MASK Shifting signed 32-bit value by 31 bits is undefined, so changing significant bit to unsigned. The UBSAN warning calltrace like below: UBSAN: shift-out-of-bounds in security/commoncap.c:1252:2 left shift of 1 by 31 places cannot be represented in type 'int' Call Trace: dump_stack_lvl+0x7d/0xa5 dump_stack+0x15/0x1b ubsan_epilogue+0xe/0x4e __ubsan_handle_shift_out_of_bounds+0x1e7/0x20c cap_task_prctl+0x561/0x6f0 security_task_prctl+0x5a/0xb0 __x64_sys_prctl+0x61/0x8f0 do_syscall_64+0x58/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: e338d263a76a ("Add 64-bit capability support to the kernel") Signed-off-by: Gaosheng Cui Acked-by: Andrew G. Morgan Reviewed-by: Serge Hallyn Signed-off-by: Paul Moore --- include/uapi/linux/capability.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 463d1ba2232a..3d61a0ae055d 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -426,7 +426,7 @@ struct vfs_ns_cap_data { */ #define CAP_TO_INDEX(x) ((x) >> 5) /* 1 << 5 == bits in __u32 */ -#define CAP_TO_MASK(x) (1 << ((x) & 31)) /* mask for indexed __u32 */ +#define CAP_TO_MASK(x) (1U << ((x) & 31)) /* mask for indexed __u32 */ #endif /* _UAPI_LINUX_CAPABILITY_H */ -- cgit v1.2.3 From 6dcabcd398946e2b0b776a8310291aeebe1ca0e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 6 Nov 2022 13:17:27 -0700 Subject: io_uring: fix typo in io_uring.h comment Just a basic s/thig/this swap, fixing up a typo introduced by a commit added in the 6.1 release. Fixes: 9cda70f622cd ("io_uring: introduce fixed buffer support for io_uring_cmd") Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ab7458033ee3..2df3225b562f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -222,7 +222,7 @@ enum io_uring_op { /* * sqe->uring_cmd_flags - * IORING_URING_CMD_FIXED use registered buffer; pass thig flag + * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. 
*/ #define IORING_URING_CMD_FIXED (1U << 0) -- cgit v1.2.3 From 58e0be1ef6118c5352b56a4d06e974c5599993a5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 15 Nov 2022 22:24:00 +0800 Subject: net: use struct_group to copy ip/ipv6 header addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel test robot reported warnings when building the bonding module with make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/net/bonding/: from ../drivers/net/bonding/bond_main.c:35: In function ‘fortify_memcpy_chk’, inlined from ‘iph_to_flow_copy_v4addrs’ at ../include/net/ip.h:566:2, inlined from ‘bond_flow_ip’ at ../drivers/net/bonding/bond_main.c:3984:3: ../include/linux/fortify-string.h:413:25: warning: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Wattribute-warning] 413 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function ‘fortify_memcpy_chk’, inlined from ‘iph_to_flow_copy_v6addrs’ at ../include/net/ipv6.h:900:2, inlined from ‘bond_flow_ip’ at ../drivers/net/bonding/bond_main.c:3994:3: ../include/linux/fortify-string.h:413:25: warning: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Wattribute-warning] 413 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is because we try to copy the whole ip/ip6 address to the flow_key, while we only point to the ip/ip6 saddr. Note that since these are UAPI headers, __struct_group() is used to avoid the compiler warnings. Reported-by: kernel test robot Fixes: c3f8324188fa ("net: Add full IPv6 addresses to flow_keys") Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20221115142400.1204786-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 2 +- include/net/ipv6.h | 2 +- include/uapi/linux/ip.h | 6 ++++-- include/uapi/linux/ipv6.h | 6 ++++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip.h b/include/net/ip.h index 038097c2a152..144bdfbb25af 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -563,7 +563,7 @@ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != offsetof(typeof(flow->addrs), v4addrs.src) + sizeof(flow->addrs.v4addrs.src)); - memcpy(&flow->addrs.v4addrs, &iph->saddr, sizeof(flow->addrs.v4addrs)); + memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 37943ba3a73c..d383c895592a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -897,7 +897,7 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); - memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); + memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 961ec16a26b8..874a92349bf5 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -100,8 +100,10 @@ struct iphdr { __u8 ttl; __u8 protocol; __sum16 check; - __be32 saddr;
- __be32 daddr; + __struct_group(/* no tag */, addrs, /* no attrs */, + __be32 saddr; + __be32 daddr; + ); /*The options start here. */ }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 03cdbe798fe3..81f4243bebb1 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -130,8 +130,10 @@ struct ipv6hdr { __u8 nexthdr; __u8 hop_limit; - struct in6_addr saddr; - struct in6_addr daddr; + __struct_group(/* no tag */, addrs, /* no attrs */, + struct in6_addr saddr; + struct in6_addr daddr; + ); }; -- cgit v1.2.3
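To see why __struct_group() resolves the fortify warning while keeping the UAPI layout, here is a standalone sketch of the pattern with simplified field types (not the real headers): the macro expands to a union of an anonymous struct, so hdr->saddr keeps working, and a named mirror, so &hdr->addrs spans exactly the two addresses.

#include <linux/stddef.h>
#include <string.h>

struct demo_hdr {
	unsigned char ttl;
	__struct_group(/* no tag */, addrs, /* no attrs */,
		unsigned int saddr;
		unsigned int daddr;
	);
};

struct demo_addrs {
	unsigned int src, dst;
};

/* One memcpy covering both addresses, bounded by the group's own size. */
static void copy_addrs(struct demo_addrs *out, const struct demo_hdr *hdr)
{
	memcpy(out, &hdr->addrs, sizeof(hdr->addrs));
}

Because the copy source is now the addrs member rather than the lone saddr field, fortified memcpy() sees a source whose size matches the copy length and the -Wattribute-warning no longer trips.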